diff --git a/dimos/hardware/can_activate.sh b/dimos/hardware/can_activate.sh
new file mode 100644
index 0000000000..60cc95e7ea
--- /dev/null
+++ b/dimos/hardware/can_activate.sh
@@ -0,0 +1,138 @@
+#!/bin/bash
+
+# The default CAN name can be set by the user via command-line parameters.
+DEFAULT_CAN_NAME="${1:-can0}"
+
+# The default bitrate for a single CAN module can be set by the user via command-line parameters.
+DEFAULT_BITRATE="${2:-1000000}"
+
+# USB hardware address (optional parameter)
+USB_ADDRESS="${3}"
+echo "-------------------START-----------------------"
+# Check if ethtool is installed.
+if ! dpkg -l | grep -q "ethtool"; then
+    # -e is required so the ANSI color escapes are interpreted (matches usage below).
+    echo -e "\e[31mError: ethtool not detected in the system.\e[0m"
+    echo "Please use the following command to install ethtool:"
+    echo "sudo apt update && sudo apt install ethtool"
+    exit 1
+
+# Check if can-utils is installed.
+if ! dpkg -l | grep -q "can-utils"; then
+    # -e interprets the color escapes; message fixed to name can-utils, not ethtool.
+    echo -e "\e[31mError: can-utils not detected in the system.\e[0m"
+    echo "Please use the following command to install can-utils:"
+    echo "sudo apt update && sudo apt install can-utils"
+    exit 1
+
+echo "Both ethtool and can-utils are installed."
+
+# Retrieve the number of CAN modules in the current system.
+CURRENT_CAN_COUNT=$(ip link show type can | grep -c "link/can")
+
+# Verify if the number of CAN modules in the current system matches the expected value.
+if [ "$CURRENT_CAN_COUNT" -ne "1" ]; then
+ if [ -z "$USB_ADDRESS" ]; then
+ # Iterate through all CAN interfaces.
+ for iface in $(ip -br link show type can | awk '{print $1}'); do
+ # Use ethtool to retrieve bus-info.
+ BUS_INFO=$(sudo ethtool -i "$iface" | grep "bus-info" | awk '{print $2}')
+
+ if [ -z "$BUS_INFO" ];then
+ echo "Error: Unable to retrieve bus-info for interface $iface."
+ continue
+ fi
+
+ echo "Interface $iface is inserted into USB port $BUS_INFO"
+ done
+ echo -e " \e[31m Error: The number of CAN modules detected by the system ($CURRENT_CAN_COUNT) does not match the expected number (1). \e[0m"
+ echo -e " \e[31m Please add the USB hardware address parameter, such as: \e[0m"
+ echo -e " bash can_activate.sh can0 1000000 1-2:1.0"
+ echo "-------------------ERROR-----------------------"
+ exit 1
+ fi
+fi
+
+# Load the gs_usb module.
+# sudo modprobe gs_usb
+# if [ $? -ne 0 ]; then
+# echo "Error: Unable to load the gs_usb module."
+# exit 1
+# fi
+
+if [ -n "$USB_ADDRESS" ]; then
+ echo "Detected USB hardware address parameter: $USB_ADDRESS"
+
+ # Use ethtool to find the CAN interface corresponding to the USB hardware address.
+ INTERFACE_NAME=""
+ for iface in $(ip -br link show type can | awk '{print $1}'); do
+ BUS_INFO=$(sudo ethtool -i "$iface" | grep "bus-info" | awk '{print $2}')
+ if [ "$BUS_INFO" = "$USB_ADDRESS" ]; then
+ INTERFACE_NAME="$iface"
+ break
+ fi
+ done
+
+ if [ -z "$INTERFACE_NAME" ]; then
+ echo "Error: Unable to find CAN interface corresponding to USB hardware address $USB_ADDRESS."
+ exit 1
+ else
+ echo "Found the interface corresponding to USB hardware address $USB_ADDRESS: $INTERFACE_NAME."
+ fi
+else
+ # Retrieve the unique CAN interface.
+ INTERFACE_NAME=$(ip -br link show type can | awk '{print $1}')
+
+ # Check if the interface name has been retrieved.
+ if [ -z "$INTERFACE_NAME" ]; then
+ echo "Error: Unable to detect CAN interface."
+ exit 1
+ fi
+ BUS_INFO=$(sudo ethtool -i "$INTERFACE_NAME" | grep "bus-info" | awk '{print $2}')
+ echo "Expected to configure a single CAN module, detected interface $INTERFACE_NAME with corresponding USB address $BUS_INFO."
+fi
+
+# Check if the current interface is already activated.
+IS_LINK_UP=$(ip link show "$INTERFACE_NAME" | grep -q "UP" && echo "yes" || echo "no")
+
+# Retrieve the bitrate of the current interface.
+CURRENT_BITRATE=$(ip -details link show "$INTERFACE_NAME" | grep -oP 'bitrate \K\d+')
+
+if [ "$IS_LINK_UP" = "yes" ] && [ "$CURRENT_BITRATE" -eq "$DEFAULT_BITRATE" ]; then
+ echo "Interface $INTERFACE_NAME is already activated with a bitrate of $DEFAULT_BITRATE."
+
+ # Check if the interface name matches the default name.
+ if [ "$INTERFACE_NAME" != "$DEFAULT_CAN_NAME" ]; then
+ echo "Rename interface $INTERFACE_NAME to $DEFAULT_CAN_NAME."
+ sudo ip link set "$INTERFACE_NAME" down
+ sudo ip link set "$INTERFACE_NAME" name "$DEFAULT_CAN_NAME"
+ sudo ip link set "$DEFAULT_CAN_NAME" up
+ echo "The interface has been renamed to $DEFAULT_CAN_NAME and reactivated."
+ else
+ echo "The interface name is already $DEFAULT_CAN_NAME."
+ fi
+else
+ # If the interface is not activated or the bitrate is different, configure it.
+ if [ "$IS_LINK_UP" = "yes" ]; then
+ echo "Interface $INTERFACE_NAME is already activated, but the bitrate is $CURRENT_BITRATE, which does not match the set value of $DEFAULT_BITRATE."
+ else
+ echo "Interface $INTERFACE_NAME is not activated or bitrate is not set."
+ fi
+
+ # Set the interface bitrate and activate it.
+ sudo ip link set "$INTERFACE_NAME" down
+ sudo ip link set "$INTERFACE_NAME" type can bitrate $DEFAULT_BITRATE
+ sudo ip link set "$INTERFACE_NAME" up
+ echo "Interface $INTERFACE_NAME has been reset to bitrate $DEFAULT_BITRATE and activated."
+
+ # Rename the interface to the default name.
+ if [ "$INTERFACE_NAME" != "$DEFAULT_CAN_NAME" ]; then
+ echo "Rename interface $INTERFACE_NAME to $DEFAULT_CAN_NAME."
+ sudo ip link set "$INTERFACE_NAME" down
+ sudo ip link set "$INTERFACE_NAME" name "$DEFAULT_CAN_NAME"
+ sudo ip link set "$DEFAULT_CAN_NAME" up
+ echo "The interface has been renamed to $DEFAULT_CAN_NAME and reactivated."
+ fi
+fi
+
+echo "-------------------OVER------------------------"
diff --git a/dimos/hardware/piper_arm.py b/dimos/hardware/piper_arm.py
new file mode 100644
index 0000000000..774f70b1c6
--- /dev/null
+++ b/dimos/hardware/piper_arm.py
@@ -0,0 +1,517 @@
+# Copyright 2025 Dimensional Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# dimos/hardware/piper_arm.py
+
+from typing import (
+ Optional,
+ Tuple,
+)
+from piper_sdk import * # from the official Piper SDK
+import numpy as np
+import time
+import subprocess
+import kinpy as kp
+import sys
+import termios
+import tty
+import select
+from scipy.spatial.transform import Rotation as R
+from dimos.utils.transform_utils import euler_to_quaternion, quaternion_to_euler
+from dimos.utils.logging_config import setup_logger
+
+import threading
+
+import pytest
+
+import dimos.core as core
+import dimos.protocol.service.lcmservice as lcmservice
+from dimos.core import In, Module, rpc
+from dimos_lcm.geometry_msgs import Pose, Vector3, Twist
+
+logger = setup_logger("dimos.hardware.piper_arm")
+
+
+class PiperArm:
+ def __init__(self, arm_name: str = "arm"):
+ self.arm = C_PiperInterface_V2()
+ self.arm.ConnectPort()
+ self.resetArm()
+ time.sleep(0.5)
+ self.resetArm()
+ time.sleep(0.5)
+ self.enable()
+ self.enable_gripper() # Enable gripper after arm is enabled
+ self.gotoZero()
+ time.sleep(1)
+ self.init_vel_controller()
+
+    def enable(self):
+        # EnablePiper() reports whether the arm is enabled; retry with a short
+        # delay instead of busy-spinning until it succeeds.
+        while not self.arm.EnablePiper():
+            time.sleep(0.01)
+        logger.info("Arm enabled")
+        # self.arm.ModeCtrl(
+        #     ctrl_mode=0x01,          # CAN command mode
+        #     move_mode=0x01,          # "Move-J", but ignored in MIT
+        #     move_spd_rate_ctrl=100,  # doesn't matter in MIT
+        #     is_mit_mode=0xAD)        # <-- the magic flag
+        self.arm.MotionCtrl_2(0x01, 0x01, 80, 0xAD)
+
+ def gotoZero(self):
+ factor = 1000
+ position = [57.0, 0.0, 215.0, 0, 90.0, 0, 0]
+ X = round(position[0] * factor)
+ Y = round(position[1] * factor)
+ Z = round(position[2] * factor)
+ RX = round(position[3] * factor)
+ RY = round(position[4] * factor)
+ RZ = round(position[5] * factor)
+ joint_6 = round(position[6] * factor)
+ logger.debug(f"Going to zero position: X={X}, Y={Y}, Z={Z}, RX={RX}, RY={RY}, RZ={RZ}")
+ self.arm.MotionCtrl_2(0x01, 0x00, 100, 0x00)
+ self.arm.EndPoseCtrl(X, Y, Z, RX, RY, RZ)
+ self.arm.GripperCtrl(0, 1000, 0x01, 0)
+
+    def gotoObserve(self):
+        # Move the end-effector to the canonical "observe" pose (units: mm / deg,
+        # scaled by 1000 for the SDK's 0.001 mm / 0.001 deg integer units).
+        factor = 1000
+        position = [57.0, 0.0, 280.0, 0, 120.0, 0, 0]
+        X = round(position[0] * factor)
+        Y = round(position[1] * factor)
+        Z = round(position[2] * factor)
+        RX = round(position[3] * factor)
+        RY = round(position[4] * factor)
+        RZ = round(position[5] * factor)
+        logger.debug(f"Going to observe position: X={X}, Y={Y}, Z={Z}, RX={RX}, RY={RY}, RZ={RZ}")
+        self.arm.MotionCtrl_2(0x01, 0x00, 100, 0x00)
+        self.arm.EndPoseCtrl(X, Y, Z, RX, RY, RZ)
+
+ def softStop(self):
+ self.gotoZero()
+ time.sleep(1)
+ self.arm.MotionCtrl_2(
+ 0x01,
+ 0x00,
+ 100,
+ )
+ self.arm.MotionCtrl_1(0x01, 0, 0)
+ time.sleep(3)
+
+ def cmd_ee_pose_values(self, x, y, z, r, p, y_, line_mode=False):
+ """Command end-effector to target pose in space (position + Euler angles)"""
+ factor = 1000
+ pose = [
+ x * factor * factor,
+ y * factor * factor,
+ z * factor * factor,
+ r * factor,
+ p * factor,
+ y_ * factor,
+ ]
+ self.arm.MotionCtrl_2(0x01, 0x02 if line_mode else 0x00, 100, 0x00)
+ self.arm.EndPoseCtrl(
+ int(pose[0]), int(pose[1]), int(pose[2]), int(pose[3]), int(pose[4]), int(pose[5])
+ )
+
+ def cmd_ee_pose(self, pose: Pose, line_mode=False):
+ """Command end-effector to target pose using Pose message"""
+ # Convert quaternion to euler angles
+ euler = quaternion_to_euler(pose.orientation, degrees=True)
+
+ # Command the pose
+ self.cmd_ee_pose_values(
+ pose.position.x,
+ pose.position.y,
+ pose.position.z,
+ euler.x,
+ euler.y,
+ euler.z,
+ line_mode,
+ )
+
+ def get_ee_pose(self):
+ """Return the current end-effector pose as Pose message with position in meters and quaternion orientation"""
+ pose = self.arm.GetArmEndPoseMsgs()
+ factor = 1000.0
+ # Extract individual pose values and convert to base units
+ # Position values are divided by 1000 to convert from SDK units to meters
+ # Rotation values are divided by 1000 to convert from SDK units to radians
+ x = pose.end_pose.X_axis / factor / factor # Convert mm to m
+ y = pose.end_pose.Y_axis / factor / factor # Convert mm to m
+ z = pose.end_pose.Z_axis / factor / factor # Convert mm to m
+ rx = pose.end_pose.RX_axis / factor
+ ry = pose.end_pose.RY_axis / factor
+ rz = pose.end_pose.RZ_axis / factor
+
+ # Create position vector (already in meters)
+ position = Vector3(x, y, z)
+
+ orientation = euler_to_quaternion(Vector3(rx, ry, rz), degrees=True)
+
+ return Pose(position, orientation)
+
+ def cmd_gripper_ctrl(self, position, effort=0.25):
+ """Command end-effector gripper"""
+ factor = 1000
+ position = position * factor * factor # meters
+ effort = effort * factor # N/m
+
+ self.arm.GripperCtrl(abs(round(position)), abs(round(effort)), 0x01, 0)
+ logger.debug(f"Commanding gripper position: {position}mm")
+
+ def enable_gripper(self):
+ """Enable the gripper using the initialization sequence"""
+ logger.info("Enabling gripper...")
+ while not self.arm.EnablePiper():
+ time.sleep(0.01)
+ self.arm.GripperCtrl(0, 1000, 0x02, 0)
+ self.arm.GripperCtrl(0, 1000, 0x01, 0)
+ logger.info("Gripper enabled")
+
+ def release_gripper(self):
+ """Release gripper by opening to 100mm (10cm)"""
+ logger.info("Releasing gripper (opening to 100mm)")
+ self.cmd_gripper_ctrl(0.1) # 0.1m = 100mm = 10cm
+
+ def get_gripper_feedback(self) -> Tuple[float, float]:
+ """
+ Get current gripper feedback.
+
+ Returns:
+ Tuple of (angle_degrees, effort) where:
+ - angle_degrees: Current gripper angle in degrees
+ - effort: Current gripper effort (0.0 to 1.0 range)
+ """
+ gripper_msg = self.arm.GetArmGripperMsgs()
+ angle_degrees = (
+ gripper_msg.gripper_state.grippers_angle / 1000.0
+ ) # Convert from SDK units to degrees
+ effort = gripper_msg.gripper_state.grippers_effort / 1000.0 # Convert from SDK units to N/m
+ return angle_degrees, effort
+
+    def close_gripper(self, commanded_effort: float = 0.5) -> None:
+        """
+        Close the gripper.
+
+        Args:
+            commanded_effort: Effort to use when closing gripper (default 0.5 N/m)
+        """
+        # Command gripper to close (0.0 position)
+        self.cmd_gripper_ctrl(0.0, effort=commanded_effort)
+        logger.info("Closing gripper")
+
+ def gripper_object_detected(self, commanded_effort: float = 0.25) -> bool:
+ """
+ Check if an object is detected in the gripper based on effort feedback.
+
+ Args:
+ commanded_effort: The effort that was used when closing gripper (default 0.25 N/m)
+
+ Returns:
+ True if object is detected in gripper, False otherwise
+ """
+ # Get gripper feedback
+ angle_degrees, actual_effort = self.get_gripper_feedback()
+
+ # Check if object is grasped (effort > 80% of commanded effort)
+ effort_threshold = 0.8 * commanded_effort
+ object_present = abs(actual_effort) > effort_threshold
+
+ if object_present:
+ logger.info(f"Object detected in gripper (effort: {actual_effort:.3f} N/m)")
+ else:
+ logger.info(f"No object detected (effort: {actual_effort:.3f} N/m)")
+
+ return object_present
+
+ def resetArm(self):
+ self.arm.MotionCtrl_1(0x02, 0, 0)
+ self.arm.MotionCtrl_2(0, 0, 0, 0x00)
+ logger.info("Resetting arm")
+
+ def init_vel_controller(self):
+ self.chain = kp.build_serial_chain_from_urdf(
+ open("dimos/hardware/piper_description.urdf"), "gripper_base"
+ )
+ self.J = self.chain.jacobian(np.zeros(6))
+ self.J_pinv = np.linalg.pinv(self.J)
+ self.dt = 0.01
+
+ def cmd_vel(self, x_dot, y_dot, z_dot, R_dot, P_dot, Y_dot):
+ joint_state = self.arm.GetArmJointMsgs().joint_state
+ # print(f"[PiperArm] Current Joints (direct): {joint_state}", type(joint_state))
+ joint_angles = np.array(
+ [
+ joint_state.joint_1,
+ joint_state.joint_2,
+ joint_state.joint_3,
+ joint_state.joint_4,
+ joint_state.joint_5,
+ joint_state.joint_6,
+ ]
+ )
+ # print(f"[PiperArm] Current Joints: {joint_angles}", type(joint_angles))
+ factor = 57295.7795 # 1000*180/3.1415926
+ joint_angles = joint_angles / factor # convert to radians
+
+ q = np.array(
+ [
+ joint_angles[0],
+ joint_angles[1],
+ joint_angles[2],
+ joint_angles[3],
+ joint_angles[4],
+ joint_angles[5],
+ ]
+ )
+ J = self.chain.jacobian(q)
+ self.J_pinv = np.linalg.pinv(J)
+ dq = self.J_pinv @ np.array([x_dot, y_dot, z_dot, R_dot, P_dot, Y_dot]) * self.dt
+ newq = q + dq
+
+ newq = newq * factor
+
+ self.arm.MotionCtrl_2(0x01, 0x01, 100, 0xAD)
+ self.arm.JointCtrl(
+ int(round(newq[0])),
+ int(round(newq[1])),
+ int(round(newq[2])),
+ int(round(newq[3])),
+ int(round(newq[4])),
+ int(round(newq[5])),
+ )
+ time.sleep(self.dt)
+ # print(f"[PiperArm] Moving to Joints to : {newq}")
+
+ def cmd_vel_ee(self, x_dot, y_dot, z_dot, RX_dot, PY_dot, YZ_dot):
+ factor = 1000
+ x_dot = x_dot * factor
+ y_dot = y_dot * factor
+ z_dot = z_dot * factor
+ RX_dot = RX_dot * factor
+ PY_dot = PY_dot * factor
+ YZ_dot = YZ_dot * factor
+
+ current_pose_msg = self.get_ee_pose()
+
+ # Convert quaternion to euler angles
+ quat = [
+ current_pose_msg.orientation.x,
+ current_pose_msg.orientation.y,
+ current_pose_msg.orientation.z,
+ current_pose_msg.orientation.w,
+ ]
+ rotation = R.from_quat(quat)
+ euler = rotation.as_euler("xyz") # Returns [rx, ry, rz] in radians
+
+ # Create current pose array [x, y, z, rx, ry, rz]
+ current_pose = np.array(
+ [
+ current_pose_msg.position.x,
+ current_pose_msg.position.y,
+ current_pose_msg.position.z,
+ euler[0],
+ euler[1],
+ euler[2],
+ ]
+ )
+
+ # Apply velocity increment
+ current_pose = (
+ current_pose + np.array([x_dot, y_dot, z_dot, RX_dot, PY_dot, YZ_dot]) * self.dt
+ )
+
+ self.cmd_ee_pose_values(
+ current_pose[0],
+ current_pose[1],
+ current_pose[2],
+ current_pose[3],
+ current_pose[4],
+ current_pose[5],
+ )
+ time.sleep(self.dt)
+
+    def disable(self):
+        self.softStop()
+
+        # DisablePiper() reports whether the arm is disabled; keep retrying
+        # until it succeeds (original loop condition was inverted).
+        while not self.arm.DisablePiper():
+            time.sleep(0.01)
+        self.arm.DisconnectPort()
+
+
+class VelocityController(Module):
+ cmd_vel: In[Twist] = None
+
+ def __init__(self, arm, period=0.01, *args, **kwargs):
+ super().__init__(*args, **kwargs)
+ self.arm = arm
+ self.period = period
+ self.latest_cmd = None
+ self.last_cmd_time = None
+
+    @rpc
+    def start(self):
+        """Subscribe to /cmd_vel and run a resolved-rate control loop in a daemon thread."""
+        self.cmd_vel.subscribe(self.handle_cmd_vel)
+
+        def control_loop():
+            while True:
+                # Stop the loop once commands have been absent for 1 second.
+                if self.last_cmd_time and (time.time() - self.last_cmd_time) > 1.0:
+                    logger.warning(
+                        "No velocity command received for 1 second, stopping control loop"
+                    )
+                    break
+                cmd_vel = self.latest_cmd
+                if cmd_vel is None:  # no command received yet — avoid AttributeError on None
+                    time.sleep(self.period)
+                    continue
+                joint_state = self.arm.GetArmJointMsgs().joint_state
+                joint_angles = np.array(
+                    [
+                        joint_state.joint_1,
+                        joint_state.joint_2,
+                        joint_state.joint_3,
+                        joint_state.joint_4,
+                        joint_state.joint_5,
+                        joint_state.joint_6,
+                    ]
+                )
+                factor = 57295.7795  # 1000*180/3.1415926
+                joint_angles = joint_angles / factor  # convert to radians
+                q = np.array(
+                    [
+                        joint_angles[0],
+                        joint_angles[1],
+                        joint_angles[2],
+                        joint_angles[3],
+                        joint_angles[4],
+                        joint_angles[5],
+                    ]
+                )
+
+                J = self.arm.chain.jacobian(q)  # kinematic chain lives on the PiperArm
+                self.J_pinv = np.linalg.pinv(J)
+                dq = (
+                    self.J_pinv
+                    @ np.array(
+                        [
+                            cmd_vel.linear.x,
+                            cmd_vel.linear.y,
+                            cmd_vel.linear.z,
+                            cmd_vel.angular.x,
+                            cmd_vel.angular.y,
+                            cmd_vel.angular.z,
+                        ]
+                    )
+                    * self.period
+                )
+                newq = q + dq
+
+                newq = newq * factor  # convert radians to scaled degree units for joint control
+
+                self.arm.MotionCtrl_2(0x01, 0x01, 100, 0xAD)
+                self.arm.JointCtrl(
+                    int(round(newq[0])),
+                    int(round(newq[1])),
+                    int(round(newq[2])),
+                    int(round(newq[3])),
+                    int(round(newq[4])),
+                    int(round(newq[5])),
+                )
+                time.sleep(self.period)
+
+        thread = threading.Thread(target=control_loop, daemon=True)
+        thread.start()
+
+ def handle_cmd_vel(self, cmd_vel: Twist):
+ self.latest_cmd = cmd_vel
+ self.last_cmd_time = time.time()
+
+
+@pytest.mark.tool
+def run_velocity_controller():
+ lcmservice.autoconf()
+ dimos = core.start(2)
+
+ velocity_controller = dimos.deploy(VelocityController, arm=arm, period=0.01)
+ velocity_controller.cmd_vel.transport = core.LCMTransport("/cmd_vel", Twist)
+
+ velocity_controller.start()
+
+ logger.info("Velocity controller started")
+ while True:
+ time.sleep(1)
+
+
+if __name__ == "__main__":
+ arm = PiperArm()
+
+ def get_key(timeout=0.1):
+ """Non-blocking key reader for arrow keys."""
+ fd = sys.stdin.fileno()
+ old_settings = termios.tcgetattr(fd)
+ try:
+ tty.setraw(fd)
+ rlist, _, _ = select.select([fd], [], [], timeout)
+ if rlist:
+ ch1 = sys.stdin.read(1)
+ if ch1 == "\x1b": # Arrow keys start with ESC
+ ch2 = sys.stdin.read(1)
+ if ch2 == "[":
+ ch3 = sys.stdin.read(1)
+ return ch1 + ch2 + ch3
+ else:
+ return ch1
+ return None
+ finally:
+ termios.tcsetattr(fd, termios.TCSADRAIN, old_settings)
+
+ def teleop_linear_vel(arm):
+ print("Use arrow keys to control linear velocity (x/y/z). Press 'q' to quit.")
+ print("Up/Down: +x/-x, Left/Right: +y/-y, 'w'/'s': +z/-z")
+ x_dot, y_dot, z_dot = 0.0, 0.0, 0.0
+ while True:
+ key = get_key(timeout=0.1)
+ if key == "\x1b[A": # Up arrow
+ x_dot += 0.01
+ elif key == "\x1b[B": # Down arrow
+ x_dot -= 0.01
+ elif key == "\x1b[C": # Right arrow
+ y_dot += 0.01
+ elif key == "\x1b[D": # Left arrow
+ y_dot -= 0.01
+ elif key == "w":
+ z_dot += 0.01
+ elif key == "s":
+ z_dot -= 0.01
+ elif key == "q":
+ logger.info("Exiting teleop")
+ arm.disable()
+ break
+
+ # Optionally, clamp velocities to reasonable limits
+ x_dot = max(min(x_dot, 0.5), -0.5)
+ y_dot = max(min(y_dot, 0.5), -0.5)
+ z_dot = max(min(z_dot, 0.5), -0.5)
+
+ # Only linear velocities, angular set to zero
+ arm.cmd_vel_ee(x_dot, y_dot, z_dot, 0, 0, 0)
+ logger.debug(
+ f"Current linear velocity: x={x_dot:.3f} m/s, y={y_dot:.3f} m/s, z={z_dot:.3f} m/s"
+ )
+
+ run_velocity_controller()
diff --git a/dimos/hardware/piper_description.urdf b/dimos/hardware/piper_description.urdf
new file mode 100755
index 0000000000..21209b6dbb
--- /dev/null
+++ b/dimos/hardware/piper_description.urdf
@@ -0,0 +1,497 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/dimos/hardware/zed_camera.py b/dimos/hardware/zed_camera.py
index b93d2577e6..7ee2aed634 100644
--- a/dimos/hardware/zed_camera.py
+++ b/dimos/hardware/zed_camera.py
@@ -17,6 +17,10 @@
import open3d as o3d
from typing import Optional, Tuple, Dict, Any
import logging
+import time
+import threading
+from reactivex import interval
+from reactivex import operators as ops
try:
import pyzed.sl as sl
@@ -25,8 +29,17 @@
logging.warning("ZED SDK not found. Please install pyzed to use ZED camera functionality.")
from dimos.hardware.stereo_camera import StereoCamera
+from dimos.core import Module, Out, rpc
+from dimos.utils.logging_config import setup_logger
-logger = logging.getLogger(__name__)
+# Import LCM message types
+from dimos_lcm.sensor_msgs import Image
+from dimos_lcm.sensor_msgs import CameraInfo
+from dimos_lcm.geometry_msgs import PoseStamped
+from dimos_lcm.std_msgs import Header, Time
+from dimos_lcm.geometry_msgs import Pose, Point, Quaternion
+
+logger = setup_logger(__name__)
class ZEDCamera(StereoCamera):
@@ -64,6 +77,7 @@ def __init__(
self.init_params = sl.InitParameters()
self.init_params.camera_resolution = resolution
self.init_params.depth_mode = depth_mode
+ self.init_params.coordinate_system = sl.COORDINATE_SYSTEM.RIGHT_HANDED_Z_UP_X_FWD
self.init_params.coordinate_units = sl.UNIT.METER
self.init_params.camera_fps = fps
@@ -84,6 +98,12 @@ def __init__(
self.point_cloud = sl.Mat()
self.confidence_map = sl.Mat()
+ # Positional tracking
+ self.tracking_enabled = False
+ self.tracking_params = sl.PositionalTrackingParameters()
+ self.camera_pose = sl.Pose()
+ self.sensors_data = sl.SensorsData()
+
self.is_opened = False
def open(self) -> bool:
@@ -109,12 +129,160 @@ def open(self) -> bool:
logger.error(f"Error opening ZED camera: {e}")
return False
- def close(self):
- """Close the ZED camera."""
- if self.is_opened:
- self.zed.close()
- self.is_opened = False
- logger.info("ZED camera closed")
+ def enable_positional_tracking(
+ self,
+ enable_area_memory: bool = False,
+ enable_pose_smoothing: bool = True,
+ enable_imu_fusion: bool = True,
+ set_floor_as_origin: bool = False,
+ initial_world_transform: Optional[sl.Transform] = None,
+ ) -> bool:
+ """
+ Enable positional tracking on the ZED camera.
+
+ Args:
+ enable_area_memory: Enable area learning to correct tracking drift
+ enable_pose_smoothing: Enable pose smoothing
+ enable_imu_fusion: Enable IMU fusion if available
+ set_floor_as_origin: Set the floor as origin (useful for robotics)
+ initial_world_transform: Initial world transform
+
+ Returns:
+ True if tracking enabled successfully
+ """
+ if not self.is_opened:
+ logger.error("ZED camera not opened")
+ return False
+
+ try:
+ # Configure tracking parameters
+ self.tracking_params.enable_area_memory = enable_area_memory
+ self.tracking_params.enable_pose_smoothing = enable_pose_smoothing
+ self.tracking_params.enable_imu_fusion = enable_imu_fusion
+ self.tracking_params.set_floor_as_origin = set_floor_as_origin
+
+ if initial_world_transform is not None:
+ self.tracking_params.initial_world_transform = initial_world_transform
+
+ # Enable tracking
+ err = self.zed.enable_positional_tracking(self.tracking_params)
+ if err != sl.ERROR_CODE.SUCCESS:
+ logger.error(f"Failed to enable positional tracking: {err}")
+ return False
+
+ self.tracking_enabled = True
+ logger.info("Positional tracking enabled successfully")
+ return True
+
+ except Exception as e:
+ logger.error(f"Error enabling positional tracking: {e}")
+ return False
+
+ def disable_positional_tracking(self):
+ """Disable positional tracking."""
+ if self.tracking_enabled:
+ self.zed.disable_positional_tracking()
+ self.tracking_enabled = False
+ logger.info("Positional tracking disabled")
+
+ def get_pose(
+ self, reference_frame: sl.REFERENCE_FRAME = sl.REFERENCE_FRAME.WORLD
+ ) -> Optional[Dict[str, Any]]:
+ """
+ Get the current camera pose.
+
+ Args:
+ reference_frame: Reference frame (WORLD or CAMERA)
+
+ Returns:
+ Dictionary containing:
+ - position: [x, y, z] in meters
+ - rotation: [x, y, z, w] quaternion
+ - euler_angles: [roll, pitch, yaw] in radians
+ - timestamp: Pose timestamp in nanoseconds
+ - confidence: Tracking confidence (0-100)
+ - valid: Whether pose is valid
+ """
+ if not self.tracking_enabled:
+ logger.error("Positional tracking not enabled")
+ return None
+
+ try:
+ # Get current pose
+ tracking_state = self.zed.get_position(self.camera_pose, reference_frame)
+
+ if tracking_state == sl.POSITIONAL_TRACKING_STATE.OK:
+ # Extract translation
+ translation = self.camera_pose.get_translation().get()
+
+ # Extract rotation (quaternion)
+ rotation = self.camera_pose.get_orientation().get()
+
+ # Get Euler angles
+ euler = self.camera_pose.get_euler_angles()
+
+ return {
+ "position": translation.tolist(),
+ "rotation": rotation.tolist(), # [x, y, z, w]
+ "euler_angles": euler.tolist(), # [roll, pitch, yaw]
+ "timestamp": self.camera_pose.timestamp.get_nanoseconds(),
+ "confidence": self.camera_pose.pose_confidence,
+ "valid": True,
+ "tracking_state": str(tracking_state),
+ }
+ else:
+ logger.warning(f"Tracking state: {tracking_state}")
+ return {"valid": False, "tracking_state": str(tracking_state)}
+
+ except Exception as e:
+ logger.error(f"Error getting pose: {e}")
+ return None
+
+ def get_imu_data(self) -> Optional[Dict[str, Any]]:
+ """
+ Get IMU sensor data if available.
+
+ Returns:
+ Dictionary containing:
+ - orientation: IMU orientation quaternion [x, y, z, w]
+ - angular_velocity: [x, y, z] in rad/s
+ - linear_acceleration: [x, y, z] in m/s²
+ - timestamp: IMU data timestamp
+ """
+ if not self.is_opened:
+ logger.error("ZED camera not opened")
+ return None
+
+ try:
+ # Get sensors data synchronized with images
+ if (
+ self.zed.get_sensors_data(self.sensors_data, sl.TIME_REFERENCE.IMAGE)
+ == sl.ERROR_CODE.SUCCESS
+ ):
+ imu = self.sensors_data.get_imu_data()
+
+ # Get IMU orientation
+ imu_orientation = imu.get_pose().get_orientation().get()
+
+ # Get angular velocity
+ angular_vel = imu.get_angular_velocity()
+
+ # Get linear acceleration
+ linear_accel = imu.get_linear_acceleration()
+
+ return {
+ "orientation": imu_orientation.tolist(),
+ "angular_velocity": angular_vel.tolist(),
+ "linear_acceleration": linear_accel.tolist(),
+ "timestamp": self.sensors_data.timestamp.get_nanoseconds(),
+ "temperature": self.sensors_data.temperature.get(sl.SENSOR_LOCATION.IMU),
+ }
+ else:
+ return None
+
+ except Exception as e:
+ logger.error(f"Error getting IMU data: {e}")
+ return None
def capture_frame(
self,
@@ -211,6 +379,52 @@ def capture_pointcloud(self) -> Optional[o3d.geometry.PointCloud]:
logger.error(f"Error capturing point cloud: {e}")
return None
+ def capture_frame_with_pose(
+ self,
+ ) -> Tuple[
+ Optional[np.ndarray], Optional[np.ndarray], Optional[np.ndarray], Optional[Dict[str, Any]]
+ ]:
+ """
+ Capture a frame with synchronized pose data.
+
+ Returns:
+ Tuple of (left_image, right_image, depth_map, pose_data)
+ """
+ if not self.is_opened:
+ logger.error("ZED camera not opened")
+ return None, None, None, None
+
+ try:
+ # Grab frame
+ if self.zed.grab(self.runtime_params) == sl.ERROR_CODE.SUCCESS:
+ # Get images and depth
+ left_img, right_img, depth = self.capture_frame()
+
+ # Get synchronized pose if tracking is enabled
+ pose_data = None
+ if self.tracking_enabled:
+ pose_data = self.get_pose()
+
+ return left_img, right_img, depth, pose_data
+ else:
+ logger.warning("Failed to grab frame from ZED camera")
+ return None, None, None, None
+
+ except Exception as e:
+ logger.error(f"Error capturing frame with pose: {e}")
+ return None, None, None, None
+
+ def close(self):
+ """Close the ZED camera."""
+ if self.is_opened:
+ # Disable tracking if enabled
+ if self.tracking_enabled:
+ self.disable_positional_tracking()
+
+ self.zed.close()
+ self.is_opened = False
+ logger.info("ZED camera closed")
+
def get_camera_info(self) -> Dict[str, Any]:
"""Get ZED camera information and calibration parameters."""
if not self.is_opened:
@@ -311,3 +525,344 @@ def __enter__(self):
def __exit__(self, exc_type, exc_val, exc_tb):
"""Context manager exit."""
self.close()
+
+
class ZEDModule(Module):
    """
    Dask module for ZED camera that publishes sensor data via LCM.

    Publishes:
    - /zed/color_image: RGB camera images
    - /zed/depth_image: Depth images
    - /zed/camera_info: Camera calibration information
    - /zed/pose: Camera pose (if tracking enabled)
    """

    # Define LCM outputs (wired up by the Module framework; None until then).
    color_image: Out[Image] = None
    depth_image: Out[Image] = None
    camera_info: Out[CameraInfo] = None
    pose: Out[PoseStamped] = None

    def __init__(
        self,
        camera_id: int = 0,
        resolution: str = "HD720",
        depth_mode: str = "NEURAL",
        fps: int = 30,
        enable_tracking: bool = True,
        enable_imu_fusion: bool = True,
        set_floor_as_origin: bool = True,
        publish_rate: float = 30.0,
        frame_id: str = "zed_camera",
        **kwargs,
    ):
        """
        Initialize ZED Module.

        Args:
            camera_id: Camera ID (0 for first ZED)
            resolution: Resolution string ("HD720", "HD1080", "HD2K", "VGA")
            depth_mode: Depth mode string ("NEURAL", "ULTRA", "QUALITY", "PERFORMANCE")
            fps: Camera frame rate
            enable_tracking: Enable positional tracking
            enable_imu_fusion: Enable IMU fusion for tracking
            set_floor_as_origin: Set floor as origin for tracking
            publish_rate: Rate to publish messages (Hz)
            frame_id: TF frame ID for messages
            **kwargs: Forwarded unchanged to the base Module constructor.
        """
        super().__init__(**kwargs)

        self.camera_id = camera_id
        self.fps = fps
        self.enable_tracking = enable_tracking
        self.enable_imu_fusion = enable_imu_fusion
        self.set_floor_as_origin = set_floor_as_origin
        self.publish_rate = publish_rate
        self.frame_id = frame_id

        # Convert string parameters to ZED enums; unknown names silently fall
        # back to the defaults (HD720 / NEURAL) via getattr's third argument.
        self.resolution = getattr(sl.RESOLUTION, resolution, sl.RESOLUTION.HD720)
        self.depth_mode = getattr(sl.DEPTH_MODE, depth_mode, sl.DEPTH_MODE.NEURAL)

        # Internal state
        self.zed_camera = None  # ZEDCamera wrapper, created in start()
        self._running = False  # True while the periodic publish loop is active
        self._subscription = None  # Disposable handle for the interval timer
        self._sequence = 0  # Monotonic sequence number for message headers

        logger.info(f"ZEDModule initialized for camera {camera_id}")

    @rpc
    def start(self):
        """Start the ZED module and begin publishing data.

        Opens the camera, optionally enables positional tracking, publishes
        the calibration once, then schedules _capture_and_publish at
        publish_rate Hz. Calling start() while already running is a no-op.
        """
        if self._running:
            logger.warning("ZED module already running")
            return

        try:
            # Initialize ZED camera
            self.zed_camera = ZEDCamera(
                camera_id=self.camera_id,
                resolution=self.resolution,
                depth_mode=self.depth_mode,
                fps=self.fps,
            )

            # Open camera
            if not self.zed_camera.open():
                logger.error("Failed to open ZED camera")
                return

            # Enable tracking if requested
            if self.enable_tracking:
                success = self.zed_camera.enable_positional_tracking(
                    enable_imu_fusion=self.enable_imu_fusion,
                    set_floor_as_origin=self.set_floor_as_origin,
                    enable_pose_smoothing=True,
                    enable_area_memory=True,
                )
                if not success:
                    # Degrade gracefully: keep streaming images without pose.
                    logger.warning("Failed to enable positional tracking")
                    self.enable_tracking = False

            # Publish camera info once at startup
            self._publish_camera_info()

            # Start periodic frame capture and publishing
            self._running = True
            publish_interval = 1.0 / self.publish_rate

            # NOTE(review): assumes `interval` fires on a background scheduler
            # every publish_interval seconds — confirm scheduler semantics.
            self._subscription = interval(publish_interval).subscribe(
                lambda _: self._capture_and_publish()
            )

            logger.info(f"ZED module started, publishing at {self.publish_rate} Hz")

        except Exception as e:
            logger.error(f"Error starting ZED module: {e}")
            self._running = False

    @rpc
    def stop(self):
        """Stop the ZED module: cancel the publish timer and close the camera."""
        if not self._running:
            return

        self._running = False

        # Stop subscription
        if self._subscription:
            self._subscription.dispose()
            self._subscription = None

        # Close camera
        if self.zed_camera:
            self.zed_camera.close()
            self.zed_camera = None

        logger.info("ZED module stopped")

    def _capture_and_publish(self):
        """Capture frame and publish all data.

        Runs once per timer tick; silently drops the tick when the module is
        stopping or the grab yields no image/depth.
        """
        if not self._running or not self.zed_camera:
            return

        try:
            # Capture frame with pose
            left_img, _, depth, pose_data = self.zed_camera.capture_frame_with_pose()

            if left_img is None or depth is None:
                return

            # Get timestamp (host wall-clock, not the camera's sensor timestamp)
            timestamp_ns = time.time_ns()
            timestamp = Time(sec=timestamp_ns // 1_000_000_000, nsec=timestamp_ns % 1_000_000_000)

            # Create header shared by all messages from this tick
            header = Header(seq=self._sequence, stamp=timestamp, frame_id=self.frame_id)
            self._sequence += 1

            # Publish color image
            self._publish_color_image(left_img, header)

            # Publish depth image
            self._publish_depth_image(depth, header)

            # Publish camera info periodically
            # NOTE(review): this actually runs on every frame, not periodically
            # — consider rate-limiting if bandwidth matters.
            self._publish_camera_info()

            # Publish pose if tracking enabled and valid
            if self.enable_tracking and pose_data and pose_data.get("valid", False):
                self._publish_pose(pose_data, header)

        except Exception as e:
            logger.error(f"Error in capture and publish: {e}")

    def _publish_color_image(self, image: np.ndarray, header: Header):
        """Publish color image as LCM message.

        Args:
            image: HxWx3 BGR (converted to RGB here) or HxW mono image.
            header: Pre-built header carrying seq/stamp/frame_id.
        """
        try:
            # Convert BGR to RGB if needed
            if len(image.shape) == 3 and image.shape[2] == 3:
                image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
            else:
                image_rgb = image

            # Create LCM Image message
            height, width = image_rgb.shape[:2]
            encoding = "rgb8" if len(image_rgb.shape) == 3 else "mono8"
            # Row stride in bytes: 3 bytes/pixel for RGB, 1 for mono.
            step = width * (3 if len(image_rgb.shape) == 3 else 1)
            data = image_rgb.tobytes()

            msg = Image(
                data_length=len(data),
                header=header,
                height=height,
                width=width,
                encoding=encoding,
                is_bigendian=0,
                step=step,
                data=data,
            )

            self.color_image.publish(msg)

        except Exception as e:
            logger.error(f"Error publishing color image: {e}")

    def _publish_depth_image(self, depth: np.ndarray, header: Header):
        """Publish depth image as LCM message.

        Args:
            depth: HxW depth map; cast to float32 meters before serializing.
            header: Pre-built header carrying seq/stamp/frame_id.
        """
        try:
            # Depth is float32 in meters
            height, width = depth.shape[:2]
            encoding = "32FC1"  # 32-bit float, single channel
            step = width * 4  # 4 bytes per float
            data = depth.astype(np.float32).tobytes()

            msg = Image(
                data_length=len(data),
                header=header,
                height=height,
                width=width,
                encoding=encoding,
                is_bigendian=0,
                step=step,
                data=data,
            )

            self.depth_image.publish(msg)

        except Exception as e:
            logger.error(f"Error publishing depth image: {e}")

    def _publish_camera_info(self):
        """Publish camera calibration information.

        Builds a ROS-style CameraInfo (row-major 3x3 K and R, 3x4 P,
        plumb_bob distortion) from the left camera's calibration.
        """
        try:
            info = self.zed_camera.get_camera_info()
            if not info:
                return

            # Get calibration parameters
            left_cam = info.get("left_cam", {})
            resolution = info.get("resolution", {})

            # Create CameraInfo message (whole-second stamp is sufficient here)
            header = Header(seq=0, stamp=Time(sec=int(time.time()), nsec=0), frame_id=self.frame_id)

            # Create camera matrix K (3x3)
            K = [
                left_cam.get("fx", 0),
                0,
                left_cam.get("cx", 0),
                0,
                left_cam.get("fy", 0),
                left_cam.get("cy", 0),
                0,
                0,
                1,
            ]

            # Distortion coefficients
            D = [
                left_cam.get("k1", 0),
                left_cam.get("k2", 0),
                left_cam.get("p1", 0),
                left_cam.get("p2", 0),
                left_cam.get("k3", 0),
            ]

            # Identity rotation matrix
            R = [1, 0, 0, 0, 1, 0, 0, 0, 1]

            # Projection matrix P (3x4)
            P = [
                left_cam.get("fx", 0),
                0,
                left_cam.get("cx", 0),
                0,
                0,
                left_cam.get("fy", 0),
                left_cam.get("cy", 0),
                0,
                0,
                0,
                1,
                0,
            ]

            msg = CameraInfo(
                D_length=len(D),
                header=header,
                height=resolution.get("height", 0),
                width=resolution.get("width", 0),
                distortion_model="plumb_bob",
                D=D,
                K=K,
                R=R,
                P=P,
                binning_x=0,
                binning_y=0,
            )

            self.camera_info.publish(msg)

        except Exception as e:
            logger.error(f"Error publishing camera info: {e}")

    def _publish_pose(self, pose_data: Dict[str, Any], header: Header):
        """Publish camera pose as PoseStamped message.

        Args:
            pose_data: Dict with "position" [x, y, z] and "rotation"
                quaternion [x, y, z, w]; missing keys default to identity.
            header: Pre-built header carrying seq/stamp/frame_id.
        """
        try:
            position = pose_data.get("position", [0, 0, 0])
            rotation = pose_data.get("rotation", [0, 0, 0, 1])  # quaternion [x,y,z,w]

            # Create Pose message
            pose = Pose(
                position=Point(x=position[0], y=position[1], z=position[2]),
                orientation=Quaternion(x=rotation[0], y=rotation[1], z=rotation[2], w=rotation[3]),
            )

            # Create PoseStamped message
            msg = PoseStamped(header=header, pose=pose)

            self.pose.publish(msg)

        except Exception as e:
            logger.error(f"Error publishing pose: {e}")

    @rpc
    def get_camera_info(self) -> Dict[str, Any]:
        """Get camera information and calibration parameters.

        Returns:
            The underlying camera's info dict, or {} before start().
        """
        if self.zed_camera:
            return self.zed_camera.get_camera_info()
        return {}

    @rpc
    def get_pose(self) -> Optional[Dict[str, Any]]:
        """Get current camera pose if tracking is enabled.

        Returns:
            Pose dict from the camera, or None when the camera is not
            started or tracking is disabled.
        """
        if self.zed_camera and self.enable_tracking:
            return self.zed_camera.get_pose()
        return None

    def cleanup(self):
        """Clean up resources on module destruction (delegates to stop())."""
        self.stop()
diff --git a/dimos/perception/manip_aio_pipeline.py b/dimos/manipulation/manip_aio_pipeline.py
similarity index 100%
rename from dimos/perception/manip_aio_pipeline.py
rename to dimos/manipulation/manip_aio_pipeline.py
diff --git a/dimos/perception/manip_aio_processer.py b/dimos/manipulation/manip_aio_processer.py
similarity index 100%
rename from dimos/perception/manip_aio_processer.py
rename to dimos/manipulation/manip_aio_processer.py
diff --git a/dimos/manipulation/visual_servoing/detection3d.py b/dimos/manipulation/visual_servoing/detection3d.py
new file mode 100644
index 0000000000..887fd023ab
--- /dev/null
+++ b/dimos/manipulation/visual_servoing/detection3d.py
@@ -0,0 +1,300 @@
+# Copyright 2025 Dimensional Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+Real-time 3D object detection processor that extracts object poses from RGB-D data.
+"""
+
+from typing import List, Optional, Tuple
+import numpy as np
+import cv2
+
+from dimos.utils.logging_config import setup_logger
+from dimos.perception.segmentation.sam_2d_seg import Sam2DSegmenter
+from dimos.perception.pointcloud.utils import extract_centroids_from_masks
+from dimos.perception.detection2d.utils import calculate_object_size_from_bbox
+from dimos.perception.common.utils import bbox2d_to_corners
+
+from dimos_lcm.geometry_msgs import Pose, Vector3, Quaternion, Point
+from dimos_lcm.vision_msgs import (
+ Detection3D,
+ Detection3DArray,
+ BoundingBox3D,
+ ObjectHypothesisWithPose,
+ ObjectHypothesis,
+ Detection2D,
+ Detection2DArray,
+ BoundingBox2D,
+ Pose2D,
+ Point2D,
+)
+from dimos_lcm.std_msgs import Header
+from dimos.manipulation.visual_servoing.utils import (
+ estimate_object_depth,
+ visualize_detections_3d,
+ transform_pose,
+)
+
+logger = setup_logger("dimos.manipulation.visual_servoing.detection3d")
+
+
class Detection3DProcessor:
    """
    Real-time 3D detection processor optimized for speed.

    Uses Sam (FastSAM) for segmentation and mask generation, then extracts
    3D centroids from depth data.
    """

    def __init__(
        self,
        camera_intrinsics: List[float],  # [fx, fy, cx, cy]
        min_confidence: float = 0.6,
        min_points: int = 30,
        max_depth: float = 1.0,
        max_object_size: float = 0.15,
    ):
        """
        Initialize the real-time 3D detection processor.

        Args:
            camera_intrinsics: [fx, fy, cx, cy] camera parameters
            min_confidence: Minimum detection confidence threshold
            min_points: Minimum 3D points required for valid detection
            max_depth: Maximum valid depth in meters
            max_object_size: Size limit in meters used to reject oversized
                detections in process_frame.
        """
        self.camera_intrinsics = camera_intrinsics
        self.min_points = min_points
        self.max_depth = max_depth
        self.max_object_size = max_object_size

        # Initialize Sam segmenter: tracker and analyzer disabled, mask
        # filtering enabled (track IDs still come back from process_image).
        self.detector = Sam2DSegmenter(
            use_tracker=False,
            use_analyzer=False,
            use_filtering=True,
        )

        self.min_confidence = min_confidence

        logger.info(
            f"Initialized Detection3DProcessor with Sam segmenter, confidence={min_confidence}, "
            f"min_points={min_points}, max_depth={max_depth}m, max_object_size={max_object_size}m"
        )

    def process_frame(
        self, rgb_image: np.ndarray, depth_image: np.ndarray, transform: Optional[np.ndarray] = None
    ) -> Tuple[Detection3DArray, Detection2DArray]:
        """
        Process a single RGB-D frame to extract 3D object detections.

        Args:
            rgb_image: RGB image (H, W, 3)
            depth_image: Depth image (H, W) in meters
            transform: Optional 4x4 transformation matrix to transform objects from camera frame to desired frame

        Returns:
            Tuple of (Detection3DArray, Detection2DArray) with 3D and 2D information
        """

        # Convert RGB to BGR for Sam (OpenCV format)
        bgr_image = cv2.cvtColor(rgb_image, cv2.COLOR_RGB2BGR)

        # Run Sam segmentation (returns parallel lists, one entry per object)
        masks, bboxes, track_ids, probs, names = self.detector.process_image(bgr_image)

        # No detections: return empty, well-formed arrays rather than None.
        if not masks or len(masks) == 0:
            return Detection3DArray(
                detections_length=0, header=Header(), detections=[]
            ), Detection2DArray(detections_length=0, header=Header(), detections=[])

        # Convert CUDA tensors to numpy arrays if needed
        numpy_masks = []
        for mask in masks:
            if hasattr(mask, "cpu"):  # PyTorch tensor
                numpy_masks.append(mask.cpu().numpy())
            else:  # Already numpy array
                numpy_masks.append(mask)

        # Extract 3D centroids from masks
        poses = extract_centroids_from_masks(
            rgb_image=rgb_image,
            depth_image=depth_image,
            masks=numpy_masks,
            camera_intrinsics=self.camera_intrinsics,
        )

        detections_3d = []
        detections_2d = []
        # Index poses by mask index, keeping only those within max_depth.
        pose_dict = {p["mask_idx"]: p for p in poses if p["centroid"][2] < self.max_depth}

        for i, (bbox, name, prob, track_id) in enumerate(zip(bboxes, names, probs, track_ids)):
            if i not in pose_dict:
                continue

            pose = pose_dict[i]
            obj_cam_pos = pose["centroid"]

            # NOTE(review): redundant — pose_dict already filtered out
            # centroids at or beyond max_depth.
            if obj_cam_pos[2] > self.max_depth:
                continue

            # Calculate object size from bbox and depth
            width_m, height_m = calculate_object_size_from_bbox(
                bbox, obj_cam_pos[2], self.camera_intrinsics
            )

            # Calculate depth dimension using segmentation mask
            depth_m = estimate_object_depth(
                depth_image, numpy_masks[i] if i < len(numpy_masks) else None, bbox
            )

            size_x = max(width_m, 0.01)  # Minimum 1cm width
            size_y = max(height_m, 0.01)  # Minimum 1cm height
            size_z = max(depth_m, 0.01)  # Minimum 1cm depth

            # NOTE(review): min(...) rejects an object only when ALL three
            # dimensions exceed max_object_size; if the intent is to reject
            # anything with one oversized dimension, this should be max(...).
            if min(size_x, size_y, size_z) > self.max_object_size:
                continue

            # Transform to desired frame if transform matrix is provided
            if transform is not None:
                # Get orientation as euler angles, default to no rotation if not available
                obj_cam_orientation = pose.get(
                    "rotation", np.array([0.0, 0.0, 0.0])
                )  # Default to no rotation
                transformed_pose = transform_pose(
                    obj_cam_pos, obj_cam_orientation, transform, to_robot=True
                )
                center_pose = transformed_pose
            else:
                # If no transform, use camera coordinates
                center_pose = Pose(
                    Point(obj_cam_pos[0], obj_cam_pos[1], obj_cam_pos[2]),
                    Quaternion(0.0, 0.0, 0.0, 1.0),  # Default orientation
                )

            # Create Detection3D object
            detection = Detection3D(
                results_length=1,
                header=Header(),  # Empty header
                results=[
                    ObjectHypothesisWithPose(
                        hypothesis=ObjectHypothesis(class_id=name, score=float(prob))
                    )
                ],
                bbox=BoundingBox3D(center=center_pose, size=Vector3(size_x, size_y, size_z)),
                id=str(track_id),
            )

            detections_3d.append(detection)

            # Create corresponding Detection2D (same index/ID as the 3D entry)
            x1, y1, x2, y2 = bbox
            center_x = (x1 + x2) / 2.0
            center_y = (y1 + y2) / 2.0
            width = x2 - x1
            height = y2 - y1

            detection_2d = Detection2D(
                results_length=1,
                header=Header(),
                results=[
                    ObjectHypothesisWithPose(
                        hypothesis=ObjectHypothesis(class_id=name, score=float(prob))
                    )
                ],
                bbox=BoundingBox2D(
                    center=Pose2D(position=Point2D(center_x, center_y), theta=0.0),
                    size_x=float(width),
                    size_y=float(height),
                ),
                id=str(track_id),
            )
            detections_2d.append(detection_2d)

        # Create and return both arrays
        return (
            Detection3DArray(
                detections_length=len(detections_3d), header=Header(), detections=detections_3d
            ),
            Detection2DArray(
                detections_length=len(detections_2d), header=Header(), detections=detections_2d
            ),
        )

    def visualize_detections(
        self,
        rgb_image: np.ndarray,
        detections_3d: List[Detection3D],
        detections_2d: List[Detection2D],
        show_coordinates: bool = True,
    ) -> np.ndarray:
        """
        Visualize detections with 3D position overlay next to bounding boxes.

        Args:
            rgb_image: Original RGB image
            detections_3d: List of Detection3D objects
            detections_2d: List of Detection2D objects (must be 1:1 correspondence)
            show_coordinates: Whether to show 3D coordinates

        Returns:
            Visualization image
        """
        # Extract 2D bboxes from Detection2D objects

        bboxes_2d = []
        for det_2d in detections_2d:
            if det_2d.bbox:
                x1, y1, x2, y2 = bbox2d_to_corners(det_2d.bbox)
                bboxes_2d.append([x1, y1, x2, y2])

        return visualize_detections_3d(rgb_image, detections_3d, show_coordinates, bboxes_2d)

    def get_closest_detection(
        self, detections: List[Detection3D], class_filter: Optional[str] = None
    ) -> Optional[Detection3D]:
        """
        Get the closest detection with valid 3D data.

        Args:
            detections: List of Detection3D objects
            class_filter: Optional class name to filter by

        Returns:
            Closest Detection3D or None
        """
        valid_detections = []
        for d in detections:
            # Check if has valid bbox center position
            if d.bbox and d.bbox.center and d.bbox.center.position:
                # Check class filter if specified
                if class_filter is None or (
                    d.results_length > 0 and d.results[0].hypothesis.class_id == class_filter
                ):
                    valid_detections.append(d)

        if not valid_detections:
            return None

        # Sort by depth (Z coordinate); abs() handles frames where Z may be
        # negative toward the camera.
        def get_z_coord(d):
            return abs(d.bbox.center.position.z)

        return min(valid_detections, key=get_z_coord)

    def cleanup(self):
        """Clean up resources (delegates to the segmenter when supported)."""
        if hasattr(self.detector, "cleanup"):
            self.detector.cleanup()
        logger.info("Detection3DProcessor cleaned up")
diff --git a/dimos/manipulation/visual_servoing/manipulation_module.py b/dimos/manipulation/visual_servoing/manipulation_module.py
new file mode 100644
index 0000000000..b2724fb59a
--- /dev/null
+++ b/dimos/manipulation/visual_servoing/manipulation_module.py
@@ -0,0 +1,961 @@
+# Copyright 2025 Dimensional Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+Manipulation module for robotic grasping with visual servoing.
+Handles grasping logic, state machine, and hardware coordination as a Dimos module.
+"""
+
+import cv2
+import time
+import threading
+from typing import Optional, Tuple, Any, Dict
+from enum import Enum
+from collections import deque
+
+import numpy as np
+
+from dimos.core import Module, In, Out, rpc
+from dimos_lcm.sensor_msgs import Image, CameraInfo
+from dimos_lcm.geometry_msgs import Vector3, Pose, Point, Quaternion
+from dimos_lcm.vision_msgs import Detection3DArray, Detection2DArray
+
+from dimos.hardware.piper_arm import PiperArm
+from dimos.manipulation.visual_servoing.detection3d import Detection3DProcessor
+from dimos.manipulation.visual_servoing.pbvs import PBVS
+from dimos.perception.common.utils import find_clicked_detection
+from dimos.manipulation.visual_servoing.utils import (
+ create_manipulation_visualization,
+ select_points_from_depth,
+ transform_points_3d,
+ update_target_grasp_pose,
+ apply_grasp_distance,
+ is_target_reached,
+)
+from dimos.utils.transform_utils import (
+ pose_to_matrix,
+ matrix_to_pose,
+ create_transform_from_6dof,
+ compose_transforms,
+)
+from dimos.utils.logging_config import setup_logger
+
+logger = setup_logger("dimos.manipulation.visual_servoing.manipulation_module")
+
+
class GraspStage(Enum):
    """Enum for different grasp stages of the pick-and-place state machine."""

    IDLE = "idle"  # No active grasp task
    PRE_GRASP = "pre_grasp"  # Approaching the standoff pose before grasping
    GRASP = "grasp"  # Moving in to the grasp pose
    CLOSE_AND_RETRACT = "close_and_retract"  # Closing gripper, then lifting
    PLACE = "place"  # Moving the held object to the place target
    RETRACT = "retract"  # Backing away after releasing the object
+
+
class Feedback:
    """Feedback data containing state information about the manipulation process."""

    def __init__(
        self,
        grasp_stage: GraspStage,
        target_tracked: bool,
        current_executed_pose: Optional[Pose] = None,
        current_ee_pose: Optional[Pose] = None,
        current_camera_pose: Optional[Pose] = None,
        target_pose: Optional[Pose] = None,
        waiting_for_reach: bool = False,
        success: Optional[bool] = None,
    ):
        # Current stage of the grasp state machine.
        self.grasp_stage = grasp_stage
        # Whether the target object is currently tracked by detection.
        self.target_tracked = target_tracked
        # Pose most recently commanded to the arm (may differ from target_pose).
        self.current_executed_pose = current_executed_pose
        # Latest end-effector pose reported by the arm.
        self.current_ee_pose = current_ee_pose
        # Latest camera pose — presumably EE pose composed with the
        # EE-to-camera transform; confirm against the producer.
        self.current_camera_pose = current_camera_pose
        # Desired target pose for the active stage.
        self.target_pose = target_pose
        # True while the controller is waiting for the arm to reach a pose.
        self.waiting_for_reach = waiting_for_reach
        # None while the task is in progress; True/False once it finishes.
        self.success = success
+
+
+class ManipulationModule(Module):
+ """
+ Manipulation module for visual servoing and grasping.
+
+ Subscribes to:
+ - ZED RGB images
+ - ZED depth images
+ - ZED camera info
+
+ Publishes:
+ - Visualization images
+
+ RPC methods:
+ - handle_keyboard_command: Process keyboard input
+ - pick_and_place: Execute pick and place task
+ """
+
+ # LCM inputs
+ rgb_image: In[Image] = None
+ depth_image: In[Image] = None
+ camera_info: In[CameraInfo] = None
+
+ # LCM outputs
+ viz_image: Out[Image] = None
+
+ def __init__(
+ self,
+ ee_to_camera_6dof: Optional[list] = None,
+ **kwargs,
+ ):
+ """
+ Initialize manipulation module.
+
+ Args:
+ ee_to_camera_6dof: EE to camera transform [x, y, z, rx, ry, rz] in meters and radians
+ workspace_min_radius: Minimum workspace radius in meters
+ workspace_max_radius: Maximum workspace radius in meters
+ min_grasp_pitch_degrees: Minimum grasp pitch angle (at max radius)
+ max_grasp_pitch_degrees: Maximum grasp pitch angle (at min radius)
+ """
+ super().__init__(**kwargs)
+
+ self.arm = PiperArm()
+
+ if ee_to_camera_6dof is None:
+ ee_to_camera_6dof = [-0.065, 0.03, -0.095, 0.0, -1.57, 0.0]
+ pos = Vector3(ee_to_camera_6dof[0], ee_to_camera_6dof[1], ee_to_camera_6dof[2])
+ rot = Vector3(ee_to_camera_6dof[3], ee_to_camera_6dof[4], ee_to_camera_6dof[5])
+ self.T_ee_to_camera = create_transform_from_6dof(pos, rot)
+
+ self.camera_intrinsics = None
+ self.detector = None
+ self.pbvs = None
+
+ # Control state
+ self.last_valid_target = None
+ self.waiting_for_reach = False
+ self.current_executed_pose = None # Track the actual pose sent to arm
+ self.target_updated = False
+ self.waiting_start_time = None
+ self.reach_pose_timeout = 20.0
+
+ # Grasp parameters
+ self.grasp_width_offset = 0.03
+ self.pregrasp_distance = 0.25
+ self.grasp_distance_range = 0.03
+ self.grasp_close_delay = 2.0
+ self.grasp_reached_time = None
+ self.gripper_max_opening = 0.07
+
+ # Workspace limits and dynamic pitch parameters
+ self.workspace_min_radius = 0.2
+ self.workspace_max_radius = 0.75
+ self.min_grasp_pitch_degrees = 5.0
+ self.max_grasp_pitch_degrees = 60.0
+
+ # Grasp stage tracking
+ self.grasp_stage = GraspStage.IDLE
+
+ # Pose stabilization tracking
+ self.pose_history_size = 4
+ self.pose_stabilization_threshold = 0.01
+ self.stabilization_timeout = 25.0
+ self.stabilization_start_time = None
+ self.reached_poses = deque(maxlen=self.pose_history_size)
+ self.adjustment_count = 0
+
+ # Pose reachability tracking
+ self.ee_pose_history = deque(maxlen=20) # Keep history of EE poses
+ self.stuck_pose_threshold = 0.001 # 1mm movement threshold
+ self.stuck_pose_adjustment_degrees = 5.0
+ self.stuck_count = 0
+ self.max_stuck_reattempts = 7
+
+ # State for visualization
+ self.current_visualization = None
+ self.last_detection_3d_array = None
+ self.last_detection_2d_array = None
+
+ # Grasp result and task tracking
+ self.pick_success = None
+ self.final_pregrasp_pose = None
+ self.task_failed = False
+ self.overall_success = None
+
+ # Task control
+ self.task_running = False
+ self.task_thread = None
+ self.stop_event = threading.Event()
+
+ # Latest sensor data
+ self.latest_rgb = None
+ self.latest_depth = None
+ self.latest_camera_info = None
+
+ # Target selection
+ self.target_click = None
+
+ # Place target position and object info
+ self.home_pose = Pose(Point(0.0, 0.0, 0.0), Quaternion(0.0, 0.0, 0.0, 1.0))
+ self.place_target_position = None
+ self.target_object_height = None
+ self.retract_distance = 0.12
+ self.place_pose = None
+ self.retract_pose = None
+ self.arm.gotoObserve()
+
+ @rpc
+ def start(self):
+ """Start the manipulation module."""
+ # Subscribe to camera data
+ self.rgb_image.subscribe(self._on_rgb_image)
+ self.depth_image.subscribe(self._on_depth_image)
+ self.camera_info.subscribe(self._on_camera_info)
+
+ logger.info("Manipulation module started")
+
+ @rpc
+ def stop(self):
+ """Stop the manipulation module."""
+ # Stop any running task
+ self.stop_event.set()
+ if self.task_thread and self.task_thread.is_alive():
+ self.task_thread.join(timeout=5.0)
+
+ self.reset_to_idle()
+ logger.info("Manipulation module stopped")
+
+ def _on_rgb_image(self, msg: Image):
+ """Handle RGB image messages."""
+ try:
+ data = np.frombuffer(msg.data, dtype=np.uint8)
+ if msg.encoding == "rgb8":
+ self.latest_rgb = data.reshape((msg.height, msg.width, 3))
+ else:
+ logger.warning(f"Unsupported RGB encoding: {msg.encoding}")
+ except Exception as e:
+ logger.error(f"Error processing RGB image: {e}")
+
+ def _on_depth_image(self, msg: Image):
+ """Handle depth image messages."""
+ try:
+ if msg.encoding == "32FC1":
+ data = np.frombuffer(msg.data, dtype=np.float32)
+ self.latest_depth = data.reshape((msg.height, msg.width))
+ else:
+ logger.warning(f"Unsupported depth encoding: {msg.encoding}")
+ except Exception as e:
+ logger.error(f"Error processing depth image: {e}")
+
+ def _on_camera_info(self, msg: CameraInfo):
+ """Handle camera info messages."""
+ try:
+ self.camera_intrinsics = [msg.K[0], msg.K[4], msg.K[2], msg.K[5]]
+
+ if self.detector is None:
+ self.detector = Detection3DProcessor(self.camera_intrinsics)
+ self.pbvs = PBVS()
+ logger.info("Initialized detection and PBVS processors")
+
+ self.latest_camera_info = msg
+ except Exception as e:
+ logger.error(f"Error processing camera info: {e}")
+
+ @rpc
+ def get_single_rgb_frame(self) -> Optional[np.ndarray]:
+ """
+ get the latest rgb frame from the camera
+ """
+ return self.latest_rgb
+
+ @rpc
+ def handle_keyboard_command(self, key: str) -> str:
+ """
+ Handle keyboard commands for robot control.
+
+ Args:
+ key: Keyboard key as string
+
+ Returns:
+ Action taken as string, or empty string if no action
+ """
+ key_code = ord(key) if len(key) == 1 else int(key)
+
+ if key_code == ord("r"):
+ self.stop_event.set()
+ self.task_running = False
+ self.reset_to_idle()
+ return "reset"
+ elif key_code == ord("s"):
+ logger.info("SOFT STOP - Emergency stopping robot!")
+ self.arm.softStop()
+ self.stop_event.set()
+ self.task_running = False
+ return "stop"
+ elif key_code == ord(" ") and self.pbvs and self.pbvs.target_grasp_pose:
+ if self.grasp_stage == GraspStage.PRE_GRASP:
+ self.set_grasp_stage(GraspStage.GRASP)
+ logger.info("Executing target pose")
+ return "execute"
+ elif key_code == ord("g"):
+ logger.info("Opening gripper")
+ self.arm.release_gripper()
+ return "release"
+
+ return ""
+
+ @rpc
+ def pick_and_place(
+ self, target_x: int = None, target_y: int = None, place_x: int = None, place_y: int = None
+ ) -> Dict[str, Any]:
+ """
+ Start a pick and place task.
+
+ Args:
+ target_x: Optional X coordinate of target object
+ target_y: Optional Y coordinate of target object
+ place_x: Optional X coordinate of place location
+ place_y: Optional Y coordinate of place location
+
+ Returns:
+ Dict with status and message
+ """
+ if self.task_running:
+ return {"status": "error", "message": "Task already running"}
+
+ if self.camera_intrinsics is None:
+ return {"status": "error", "message": "Camera not initialized"}
+
+ if target_x is not None and target_y is not None:
+ self.target_click = (target_x, target_y)
+ if place_x is not None and self.latest_depth is not None:
+ points_3d_camera = select_points_from_depth(
+ self.latest_depth,
+ (place_x, place_y),
+ self.camera_intrinsics,
+ radius=10,
+ )
+
+ if points_3d_camera.size > 0:
+ ee_pose = self.arm.get_ee_pose()
+ ee_transform = pose_to_matrix(ee_pose)
+ camera_transform = compose_transforms(ee_transform, self.T_ee_to_camera)
+
+ points_3d_world = transform_points_3d(
+ points_3d_camera,
+ camera_transform,
+ to_robot=True,
+ )
+
+ place_position = np.mean(points_3d_world, axis=0)
+ self.place_target_position = place_position
+ logger.info(
+ f"Place target set at position: ({place_position[0]:.3f}, {place_position[1]:.3f}, {place_position[2]:.3f})"
+ )
+ else:
+ logger.warning("No valid depth points found at place location")
+ self.place_target_position = None
+ else:
+ self.place_target_position = None
+
+ self.task_failed = False
+ self.stop_event.clear()
+
+ if self.task_thread and self.task_thread.is_alive():
+ self.stop_event.set()
+ self.task_thread.join(timeout=1.0)
+ self.task_thread = threading.Thread(target=self._run_pick_and_place, daemon=True)
+ self.task_thread.start()
+
+ return {"status": "started", "message": "Pick and place task started"}
+
+ def _run_pick_and_place(self):
+ """Run the pick and place task loop."""
+ self.task_running = True
+ logger.info("Starting pick and place task")
+
+ try:
+ while not self.stop_event.is_set():
+ if self.task_failed:
+ logger.error("Task failed, terminating pick and place")
+ self.stop_event.set()
+ break
+
+ feedback = self.update()
+ if feedback is None:
+ time.sleep(0.01)
+ continue
+
+ if feedback.success is not None:
+ if feedback.success:
+ logger.info("Pick and place completed successfully!")
+ else:
+ logger.warning("Pick and place failed")
+ self.reset_to_idle()
+ self.stop_event.set()
+ break
+
+ time.sleep(0.01)
+
+ except Exception as e:
+ logger.error(f"Error in pick and place task: {e}")
+ self.task_failed = True
+ finally:
+ self.task_running = False
+ logger.info("Pick and place task ended")
+
+ def set_grasp_stage(self, stage: GraspStage):
+ """Set the grasp stage."""
+ self.grasp_stage = stage
+ logger.info(f"Grasp stage: {stage.value}")
+
+ def calculate_dynamic_grasp_pitch(self, target_pose: Pose) -> float:
+ """
+ Calculate grasp pitch dynamically based on distance from robot base.
+ Maps workspace radius to grasp pitch angle.
+
+ Args:
+ target_pose: Target pose
+
+ Returns:
+ Grasp pitch angle in degrees
+ """
+ # Calculate 3D distance from robot base (assumes robot at origin)
+ position = target_pose.position
+ distance = np.sqrt(position.x**2 + position.y**2 + position.z**2)
+
+ # Clamp distance to workspace limits
+ distance = np.clip(distance, self.workspace_min_radius, self.workspace_max_radius)
+
+ # Linear interpolation: min_radius -> max_pitch, max_radius -> min_pitch
+ # Normalized distance (0 to 1)
+ normalized_dist = (distance - self.workspace_min_radius) / (
+ self.workspace_max_radius - self.workspace_min_radius
+ )
+
+ # Inverse mapping: closer objects need higher pitch
+ pitch_degrees = self.max_grasp_pitch_degrees - (
+ normalized_dist * (self.max_grasp_pitch_degrees - self.min_grasp_pitch_degrees)
+ )
+
+ return pitch_degrees
+
+ def check_within_workspace(self, target_pose: Pose) -> bool:
+ """
+ Check if pose is within workspace limits and log error if not.
+
+ Args:
+ target_pose: Target pose to validate
+
+ Returns:
+ True if within workspace, False otherwise
+ """
+ # Calculate 3D distance from robot base
+ position = target_pose.position
+ distance = np.sqrt(position.x**2 + position.y**2 + position.z**2)
+
+ if not (self.workspace_min_radius <= distance <= self.workspace_max_radius):
+ logger.error(
+ f"Target outside workspace limits: distance {distance:.3f}m not in [{self.workspace_min_radius:.2f}, {self.workspace_max_radius:.2f}]"
+ )
+ return False
+
+ return True
+
+ def _check_reach_timeout(self) -> Tuple[bool, float]:
+ """Check if robot has exceeded timeout while reaching pose.
+
+ Returns:
+ Tuple of (timed_out, time_elapsed)
+ """
+ if self.waiting_start_time:
+ time_elapsed = time.time() - self.waiting_start_time
+ if time_elapsed > self.reach_pose_timeout:
+ logger.warning(
+ f"Robot failed to reach pose within {self.reach_pose_timeout}s timeout"
+ )
+ self.task_failed = True
+ self.reset_to_idle()
+ return True, time_elapsed
+ return False, time_elapsed
+ return False, 0.0
+
+ def _check_if_stuck(self) -> bool:
+ """
+ Check if robot is stuck by analyzing pose history.
+
+ Returns:
+ Tuple of (is_stuck, max_std_dev_mm)
+ """
+ if len(self.ee_pose_history) < self.ee_pose_history.maxlen:
+ return False
+
+ # Extract positions from pose history
+ positions = np.array(
+ [[p.position.x, p.position.y, p.position.z] for p in self.ee_pose_history]
+ )
+
+ # Calculate standard deviation of positions
+ std_devs = np.std(positions, axis=0)
+ # Check if all standard deviations are below stuck threshold
+ is_stuck = np.all(std_devs < self.stuck_pose_threshold)
+
+ return is_stuck
+
+ def check_reach_and_adjust(self) -> bool:
+ """
+ Check if robot has reached the current executed pose while waiting.
+ Handles timeout internally by failing the task.
+ Also detects if the robot is stuck (not moving towards target).
+
+ Returns:
+ True if reached, False if still waiting or not in waiting state
+ """
+ if not self.waiting_for_reach or not self.current_executed_pose:
+ return False
+
+ # Get current end-effector pose
+ ee_pose = self.arm.get_ee_pose()
+ target_pose = self.current_executed_pose
+
+ # Check for timeout - this will fail task and reset if timeout occurred
+ timed_out, time_elapsed = self._check_reach_timeout()
+ if timed_out:
+ return False
+
+ self.ee_pose_history.append(ee_pose)
+
+ # Check if robot is stuck
+ is_stuck = self._check_if_stuck()
+ if is_stuck:
+ if self.grasp_stage == GraspStage.RETRACT or self.grasp_stage == GraspStage.PLACE:
+ self.waiting_for_reach = False
+ self.waiting_start_time = None
+ self.stuck_count = 0
+ self.ee_pose_history.clear()
+ return True
+ self.stuck_count += 1
+ pitch_degrees = self.calculate_dynamic_grasp_pitch(target_pose)
+ if self.stuck_count % 2 == 0:
+ pitch_degrees += self.stuck_pose_adjustment_degrees * (1 + self.stuck_count // 2)
+ else:
+ pitch_degrees -= self.stuck_pose_adjustment_degrees * (1 + self.stuck_count // 2)
+
+ pitch_degrees = max(
+ self.min_grasp_pitch_degrees, min(self.max_grasp_pitch_degrees, pitch_degrees)
+ )
+ updated_target_pose = update_target_grasp_pose(target_pose, ee_pose, 0.0, pitch_degrees)
+ self.arm.cmd_ee_pose(updated_target_pose)
+ self.current_executed_pose = updated_target_pose
+ self.ee_pose_history.clear()
+ self.waiting_for_reach = True
+ self.waiting_start_time = time.time()
+ return False
+
+ if self.stuck_count >= self.max_stuck_reattempts:
+ self.task_failed = True
+ self.reset_to_idle()
+ return False
+
+ if is_target_reached(target_pose, ee_pose, self.pbvs.target_tolerance):
+ self.waiting_for_reach = False
+ self.waiting_start_time = None
+ self.stuck_count = 0
+ self.ee_pose_history.clear()
+ return True
+ return False
+
+ def _update_tracking(self, detection_3d_array: Optional[Detection3DArray]) -> bool:
+ """Update tracking with new detections."""
+ if not detection_3d_array or not self.pbvs:
+ return False
+
+ target_tracked = self.pbvs.update_tracking(detection_3d_array)
+ if target_tracked:
+ self.target_updated = True
+ self.last_valid_target = self.pbvs.get_current_target()
+ return target_tracked
+
+ def reset_to_idle(self):
+ """Reset the manipulation system to IDLE state."""
+ if self.pbvs:
+ self.pbvs.clear_target()
+ self.grasp_stage = GraspStage.IDLE
+ self.reached_poses.clear()
+ self.ee_pose_history.clear()
+ self.adjustment_count = 0
+ self.waiting_for_reach = False
+ self.current_executed_pose = None
+ self.target_updated = False
+ self.stabilization_start_time = None
+ self.grasp_reached_time = None
+ self.waiting_start_time = None
+ self.pick_success = None
+ self.final_pregrasp_pose = None
+ self.overall_success = None
+ self.place_pose = None
+ self.retract_pose = None
+ self.stuck_count = 0
+
+ self.arm.gotoObserve()
+
+ def execute_idle(self):
+ """Execute idle stage."""
+ pass
+
    def execute_pre_grasp(self):
        """Execute pre-grasp stage: visual servoing to pre-grasp position.

        Per tick: while a motion is in flight, poll reach/stuck handling and
        collect reached poses; otherwise recompute the pre-grasp pose from the
        tracked target and either transition to GRASP (poses stabilized) or
        command a new pose. Aborts if stabilization takes too long.
        """
        if self.waiting_for_reach:
            if self.check_reach_and_adjust():
                # Reached poses feed check_target_stabilized() below.
                self.reached_poses.append(self.current_executed_pose)
                self.target_updated = False
                time.sleep(0.2)
            return
        if (
            self.stabilization_start_time
            and (time.time() - self.stabilization_start_time) > self.stabilization_timeout
        ):
            logger.warning(
                f"Failed to get stable grasp after {self.stabilization_timeout} seconds, resetting"
            )
            self.task_failed = True
            self.reset_to_idle()
            return

        ee_pose = self.arm.get_ee_pose()
        # NOTE(review): assumes pbvs.current_target is set while in PRE_GRASP;
        # its bbox.center is used as the pose for pitch computation — confirm.
        dynamic_pitch = self.calculate_dynamic_grasp_pitch(self.pbvs.current_target.bbox.center)

        _, _, _, has_target, target_pose = self.pbvs.compute_control(
            ee_pose, self.pregrasp_distance, dynamic_pitch
        )
        if target_pose and has_target:
            # Validate target pose is within workspace
            if not self.check_within_workspace(target_pose):
                self.task_failed = True
                self.reset_to_idle()
                return

            if self.check_target_stabilized():
                logger.info("Target stabilized, transitioning to GRASP")
                self.final_pregrasp_pose = self.current_executed_pose
                self.grasp_stage = GraspStage.GRASP
                self.adjustment_count = 0
                self.waiting_for_reach = False
            elif not self.waiting_for_reach and self.target_updated:
                # Command the freshly computed pre-grasp pose and wait on it.
                self.arm.cmd_ee_pose(target_pose)
                self.current_executed_pose = target_pose
                self.waiting_for_reach = True
                self.waiting_start_time = time.time()
                self.target_updated = False
                self.adjustment_count += 1
                time.sleep(0.2)
+
    def execute_grasp(self):
        """Execute grasp stage: move to final grasp position.

        Waits for the commanded grasp pose, holds for ``grasp_close_delay``
        seconds, then hands off to CLOSE_AND_RETRACT. Before a pose is in
        flight, computes a grasp pose from the last valid target and commands
        it with the gripper pre-opened to the object width.
        """
        if self.waiting_for_reach:
            # Latch the first time the grasp pose is reached.
            if self.check_reach_and_adjust() and not self.grasp_reached_time:
                self.grasp_reached_time = time.time()
            return

        if self.grasp_reached_time:
            # Let the arm settle before closing the gripper.
            if (time.time() - self.grasp_reached_time) >= self.grasp_close_delay:
                logger.info("Grasp delay completed, closing gripper")
                self.grasp_stage = GraspStage.CLOSE_AND_RETRACT
            return

        if self.last_valid_target:
            # Calculate dynamic pitch for current target
            dynamic_pitch = self.calculate_dynamic_grasp_pitch(self.last_valid_target.bbox.center)
            # Map pitch to an approach offset in [-range, +range]:
            # 0 deg -> -grasp_distance_range, 90 deg -> +grasp_distance_range.
            normalized_pitch = dynamic_pitch / 90.0
            grasp_distance = -self.grasp_distance_range + (
                2 * self.grasp_distance_range * normalized_pitch
            )

            ee_pose = self.arm.get_ee_pose()
            _, _, _, has_target, target_pose = self.pbvs.compute_control(
                ee_pose, grasp_distance, dynamic_pitch
            )

            if target_pose and has_target:
                # Validate grasp pose is within workspace
                if not self.check_within_workspace(target_pose):
                    self.task_failed = True
                    self.reset_to_idle()
                    return

                # Pre-open the gripper to the object width plus a margin,
                # clamped to [5mm, gripper_max_opening].
                object_width = self.last_valid_target.bbox.size.x
                gripper_opening = max(
                    0.005, min(object_width + self.grasp_width_offset, self.gripper_max_opening)
                )

                logger.info(f"Executing grasp: gripper={gripper_opening * 1000:.1f}mm")
                self.arm.cmd_gripper_ctrl(gripper_opening)
                self.arm.cmd_ee_pose(target_pose, line_mode=True)
                self.current_executed_pose = target_pose
                self.waiting_for_reach = True
                self.waiting_start_time = time.time()
+
    def execute_close_and_retract(self):
        """Execute the retraction sequence after gripper has been closed.

        First tick: close the gripper and command a linear retreat to the
        stored pre-grasp pose. Later ticks: once the retreat pose is reached,
        verify the grasp via the gripper's object detector and proceed to
        PLACE, finish successfully, or fail the task.
        """
        if self.waiting_for_reach and self.final_pregrasp_pose:
            if self.check_reach_and_adjust():
                logger.info("Reached pre-grasp retraction position")
                self.pick_success = self.arm.gripper_object_detected()
                if self.pick_success:
                    logger.info("Object successfully grasped!")
                    if self.place_target_position is not None:
                        logger.info("Transitioning to PLACE stage")
                        self.grasp_stage = GraspStage.PLACE
                    else:
                        self.overall_success = True
                else:
                    logger.warning("No object detected in gripper")
                    self.task_failed = True
                    self.overall_success = False
            return
        if not self.waiting_for_reach:
            logger.info("Retracting to pre-grasp position")
            # Command the retreat first, then close the gripper while moving.
            self.arm.cmd_ee_pose(self.final_pregrasp_pose, line_mode=True)
            self.current_executed_pose = self.final_pregrasp_pose
            self.arm.close_gripper()
            self.waiting_for_reach = True
            self.waiting_start_time = time.time()
+
    def execute_place(self):
        """Execute place stage: move to place position and release object.

        Commands the place pose once, then waits until it is reached, releases
        the gripper, and transitions to RETRACT.
        """
        if self.waiting_for_reach:
            # Use the already executed pose instead of recalculating
            if self.check_reach_and_adjust():
                logger.info("Reached place position, releasing gripper")
                self.arm.release_gripper()
                # Give the object time to drop before retreating.
                time.sleep(1.0)
                self.place_pose = self.current_executed_pose
                logger.info("Transitioning to RETRACT stage")
                self.grasp_stage = GraspStage.RETRACT
            return

        if not self.waiting_for_reach:
            place_pose = self.get_place_target_pose()
            if place_pose:
                logger.info("Moving to place position")
                self.arm.cmd_ee_pose(place_pose, line_mode=True)
                self.current_executed_pose = place_pose
                self.waiting_for_reach = True
                self.waiting_start_time = time.time()
            else:
                logger.error("Failed to get place target pose")
                self.task_failed = True
                self.overall_success = False
+
    def execute_retract(self):
        """Execute retract stage: retract from place position.

        First tick: derive a retract pose from the stored place pose and
        command it. Once reached, return the arm to the observe pose, close
        the gripper, and mark the task successful.
        """
        if self.waiting_for_reach and self.retract_pose:
            if self.check_reach_and_adjust():
                logger.info("Reached retract position")
                logger.info("Returning to observe position")
                self.arm.gotoObserve()
                self.arm.close_gripper()
                self.overall_success = True
                logger.info("Pick and place completed successfully!")
            return

        if not self.waiting_for_reach:
            if self.place_pose:
                # Retreat away from the place pose towards home, using the
                # same pitch mapping as the approach.
                pose_pitch = self.calculate_dynamic_grasp_pitch(self.place_pose)
                self.retract_pose = update_target_grasp_pose(
                    self.place_pose, self.home_pose, self.retract_distance, pose_pitch
                )
                logger.info("Retracting from place position")
                self.arm.cmd_ee_pose(self.retract_pose, line_mode=True)
                self.current_executed_pose = self.retract_pose
                self.waiting_for_reach = True
                self.waiting_start_time = time.time()
            else:
                logger.error("No place pose stored for retraction")
                self.task_failed = True
                self.overall_success = False
+
    def capture_and_process(
        self,
    ) -> Tuple[
        Optional[np.ndarray], Optional[Detection3DArray], Optional[Detection2DArray], Optional[Pose]
    ]:
        """Capture frame from camera data and process detections.

        Returns:
            Tuple of (rgb, detection_3d_array, detection_2d_array,
            camera_pose); all None when frames or the detector are missing.
        """
        if self.latest_rgb is None or self.latest_depth is None or self.detector is None:
            return None, None, None, None

        # Camera pose = EE pose composed with the fixed EE->camera extrinsic
        # (eye-in-hand configuration).
        ee_pose = self.arm.get_ee_pose()
        ee_transform = pose_to_matrix(ee_pose)
        camera_transform = compose_transforms(ee_transform, self.T_ee_to_camera)
        camera_pose = matrix_to_pose(camera_transform)
        detection_3d_array, detection_2d_array = self.detector.process_frame(
            self.latest_rgb, self.latest_depth, camera_transform
        )

        return self.latest_rgb, detection_3d_array, detection_2d_array, camera_pose
+
    def pick_target(self, x: int, y: int) -> bool:
        """Select a target object at the given pixel coordinates.

        Looks up the clicked 2D detection, validates the matching 3D pose
        against the workspace, hands the target to PBVS, and starts the
        PRE_GRASP stage.

        Args:
            x: Pixel column of the click.
            y: Pixel row of the click.

        Returns:
            True if a valid target was selected, False otherwise.
        """
        if not self.last_detection_2d_array or not self.last_detection_3d_array:
            logger.warning("No detections available for target selection")
            return False

        clicked_3d = find_clicked_detection(
            (x, y), self.last_detection_2d_array.detections, self.last_detection_3d_array.detections
        )
        if clicked_3d and self.pbvs:
            # Validate workspace
            if not self.check_within_workspace(clicked_3d.bbox.center):
                self.task_failed = True
                return False

            self.pbvs.set_target(clicked_3d)

            # Remember the object height for the place z-offset later.
            if clicked_3d.bbox and clicked_3d.bbox.size:
                self.target_object_height = clicked_3d.bbox.size.z
                logger.info(f"Target object height: {self.target_object_height:.3f}m")

            position = clicked_3d.bbox.center.position
            logger.info(
                f"Target selected: ID={clicked_3d.id}, pos=({position.x:.3f}, {position.y:.3f}, {position.z:.3f})"
            )
            # Reset per-attempt bookkeeping and start the pre-grasp phase.
            self.grasp_stage = GraspStage.PRE_GRASP
            self.reached_poses.clear()
            self.adjustment_count = 0
            self.waiting_for_reach = False
            self.current_executed_pose = None
            self.stabilization_start_time = time.time()
            return True
        return False
+
    def update(self) -> Optional[Feedback]:
        """Main update function that handles capture, processing, control, and visualization.

        Sequence per tick: capture + detect, consume any pending UI click,
        refresh target tracking (only while servoing and not mid-motion),
        dispatch the current stage handler, then build and publish feedback.

        Returns:
            A Feedback snapshot of the current manipulation state, or None
            when no camera frame is available.
        """
        rgb, detection_3d_array, detection_2d_array, camera_pose = self.capture_and_process()
        if rgb is None:
            return None

        self.last_detection_3d_array = detection_3d_array
        self.last_detection_2d_array = detection_2d_array
        # Consume a pending UI click, if any.
        if self.target_click:
            x, y = self.target_click
            if self.pick_target(x, y):
                self.target_click = None

        # Only refresh tracking in the visual-servoing stages, and never
        # while a motion is still in flight.
        if (
            detection_3d_array
            and self.grasp_stage in [GraspStage.PRE_GRASP, GraspStage.GRASP]
            and not self.waiting_for_reach
        ):
            self._update_tracking(detection_3d_array)
        # Dispatch to the handler for the current stage.
        stage_handlers = {
            GraspStage.IDLE: self.execute_idle,
            GraspStage.PRE_GRASP: self.execute_pre_grasp,
            GraspStage.GRASP: self.execute_grasp,
            GraspStage.CLOSE_AND_RETRACT: self.execute_close_and_retract,
            GraspStage.PLACE: self.execute_place,
            GraspStage.RETRACT: self.execute_retract,
        }
        if self.grasp_stage in stage_handlers:
            stage_handlers[self.grasp_stage]()

        target_tracked = self.pbvs.get_current_target() is not None if self.pbvs else False
        ee_pose = self.arm.get_ee_pose()
        feedback = Feedback(
            grasp_stage=self.grasp_stage,
            target_tracked=target_tracked,
            current_executed_pose=self.current_executed_pose,
            current_ee_pose=ee_pose,
            current_camera_pose=camera_pose,
            target_pose=self.pbvs.target_grasp_pose if self.pbvs else None,
            waiting_for_reach=self.waiting_for_reach,
            success=self.overall_success,
        )

        if self.task_running:
            self.current_visualization = create_manipulation_visualization(
                rgb, feedback, detection_3d_array, detection_2d_array
            )

        if self.current_visualization is not None:
            self._publish_visualization(self.current_visualization)

        return feedback
+
    def _publish_visualization(self, viz_image: np.ndarray):
        """Publish visualization image to LCM.

        Converts the BGR (OpenCV) frame to RGB and wraps it in an Image
        message. Failures are logged, never raised.

        Args:
            viz_image: Visualization frame in BGR channel order.
        """
        try:
            viz_rgb = cv2.cvtColor(viz_image, cv2.COLOR_BGR2RGB)
            height, width = viz_rgb.shape[:2]
            data = viz_rgb.tobytes()

            msg = Image(
                data_length=len(data),
                height=height,
                width=width,
                encoding="rgb8",
                is_bigendian=0,
                step=width * 3,  # tightly packed rows: 3 bytes per pixel
                data=data,
            )

            self.viz_image.publish(msg)
        except Exception as e:
            logger.error(f"Error publishing visualization: {e}")
+
+ def check_target_stabilized(self) -> bool:
+ """Check if the commanded poses have stabilized."""
+ if len(self.reached_poses) < self.reached_poses.maxlen:
+ return False
+
+ positions = np.array(
+ [[p.position.x, p.position.y, p.position.z] for p in self.reached_poses]
+ )
+ std_devs = np.std(positions, axis=0)
+ return np.all(std_devs < self.pose_stabilization_threshold)
+
    def get_place_target_pose(self) -> Optional[Pose]:
        """Get the place target pose with z-offset applied based on object height.

        Returns:
            The oriented place pose, or None when no place target is set.
        """
        if self.place_target_position is None:
            return None

        place_pos = self.place_target_position.copy()
        if self.target_object_height is not None:
            # Raise by half the object height plus a 10cm clearance so the
            # object bottom clears the support surface before release.
            z_offset = self.target_object_height / 2.0
            place_pos[2] += z_offset + 0.1

        # Identity orientation; the grasp-pose helper below sets the final one.
        place_center_pose = Pose(
            Point(place_pos[0], place_pos[1], place_pos[2]), Quaternion(0.0, 0.0, 0.0, 1.0)
        )

        ee_pose = self.arm.get_ee_pose()

        # Calculate dynamic pitch for place position
        dynamic_pitch = self.calculate_dynamic_grasp_pitch(place_center_pose)

        place_pose = update_target_grasp_pose(
            place_center_pose,
            ee_pose,
            grasp_distance=0.0,
            grasp_pitch_degrees=dynamic_pitch,
        )

        return place_pose
+
+ @rpc
+ def cleanup(self):
+ """Clean up resources on module destruction."""
+ if self.detector and hasattr(self.detector, "cleanup"):
+ self.detector.cleanup()
+ self.arm.disable()
diff --git a/dimos/manipulation/visual_servoing/pbvs.py b/dimos/manipulation/visual_servoing/pbvs.py
new file mode 100644
index 0000000000..e34ec94557
--- /dev/null
+++ b/dimos/manipulation/visual_servoing/pbvs.py
@@ -0,0 +1,490 @@
+# Copyright 2025 Dimensional Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+Position-Based Visual Servoing (PBVS) system for robotic manipulation.
+Supports both eye-in-hand and eye-to-hand configurations.
+"""
+
+import numpy as np
+from typing import Optional, Tuple, List
+from collections import deque
+from scipy.spatial.transform import Rotation as R
+from dimos_lcm.geometry_msgs import Pose, Vector3, Quaternion, Point
+from dimos_lcm.vision_msgs import Detection3D, Detection3DArray
+from dimos.utils.logging_config import setup_logger
+from dimos.manipulation.visual_servoing.utils import (
+ update_target_grasp_pose,
+ find_best_object_match,
+ create_pbvs_visualization,
+ is_target_reached,
+)
+
+logger = setup_logger("dimos.manipulation.pbvs")
+
+
class PBVS:
    """
    High-level Position-Based Visual Servoing orchestrator.

    Handles:
    - Object tracking and target management
    - Pregrasp distance computation
    - Grasp pose generation
    - Coordination with low-level controller

    Note: This class is agnostic to camera mounting (eye-in-hand vs eye-to-hand).
    The caller is responsible for providing appropriate camera and EE poses.
    """

    def __init__(
        self,
        position_gain: float = 0.5,
        rotation_gain: float = 0.3,
        max_velocity: float = 0.1,  # m/s
        max_angular_velocity: float = 0.5,  # rad/s
        target_tolerance: float = 0.01,  # 1cm
        max_tracking_distance_threshold: float = 0.12,  # Max distance for target tracking (m)
        min_size_similarity: float = 0.6,  # Min size similarity threshold (0.0-1.0)
        direct_ee_control: bool = True,  # If True, output target poses instead of velocities
    ):
        """
        Initialize PBVS system.

        Args:
            position_gain: Proportional gain for position control
            rotation_gain: Proportional gain for rotation control
            max_velocity: Maximum linear velocity command magnitude (m/s)
            max_angular_velocity: Maximum angular velocity command magnitude (rad/s)
            target_tolerance: Distance threshold for considering target reached (m)
            max_tracking_distance_threshold: Maximum distance for valid target tracking (m)
            min_size_similarity: Minimum size similarity for valid target tracking (0.0-1.0)
            direct_ee_control: If True, output target poses instead of velocity commands
        """
        # Initialize low-level controller only if not in direct control mode
        if not direct_ee_control:
            self.controller = PBVSController(
                position_gain=position_gain,
                rotation_gain=rotation_gain,
                max_velocity=max_velocity,
                max_angular_velocity=max_angular_velocity,
                target_tolerance=target_tolerance,
            )
        else:
            self.controller = None

        # Store parameters for direct mode error computation
        self.target_tolerance = target_tolerance

        # Target tracking parameters
        self.max_tracking_distance_threshold = max_tracking_distance_threshold
        self.min_size_similarity = min_size_similarity
        self.direct_ee_control = direct_ee_control

        # Target state
        self.current_target = None
        self.target_grasp_pose = None

        # Detection history for robust tracking (rolling window of the last
        # few Detection3DArray messages)
        self.detection_history_size = 3
        self.detection_history = deque(maxlen=self.detection_history_size)

        # For direct control mode visualization
        self.last_position_error = None
        self.last_target_reached = False

        logger.info(
            f"Initialized PBVS system with controller gains: pos={position_gain}, rot={rotation_gain}, "
            f"tracking_thresholds: distance={max_tracking_distance_threshold}m, size={min_size_similarity:.2f}"
        )

    def set_target(self, target_object: Detection3D) -> bool:
        """
        Set a new target object for servoing.

        Args:
            target_object: Detection3D object

        Returns:
            True if target was set successfully
        """
        if target_object and target_object.bbox and target_object.bbox.center:
            self.current_target = target_object
            self.target_grasp_pose = None  # Will be computed when needed
            logger.info(f"New target set: ID {target_object.id}")
            return True
        return False

    def clear_target(self):
        """Clear the current target."""
        self.current_target = None
        self.target_grasp_pose = None
        self.last_position_error = None
        self.last_target_reached = False
        self.detection_history.clear()
        if self.controller:
            self.controller.clear_state()
        logger.info("Target cleared")

    def get_current_target(self) -> Optional[Detection3D]:
        """
        Get the current target object.

        Returns:
            Current target Detection3D or None if no target selected
        """
        return self.current_target

    def update_tracking(self, new_detections: Optional[Detection3DArray] = None) -> bool:
        """
        Update target tracking with new detections using a rolling window.
        If tracking is lost, keeps the old target pose.

        Args:
            new_detections: Optional new detections for target tracking

        Returns:
            True if target was successfully tracked, False if lost (but target is kept)
        """
        # Check if we have a current target
        if (
            not self.current_target
            or not self.current_target.bbox
            or not self.current_target.bbox.center
        ):
            return False

        # Add new detections to history if provided
        if new_detections is not None and new_detections.detections_length > 0:
            self.detection_history.append(new_detections)

        # If no detection history, can't track
        if not self.detection_history:
            logger.debug("No detection history for target tracking - using last known pose")
            return False

        # Collect all candidates from detection history
        all_candidates = []
        for detection_array in self.detection_history:
            all_candidates.extend(detection_array.detections)

        if not all_candidates:
            logger.debug("No candidates in detection history")
            return False

        # Use stage-dependent distance threshold
        max_distance = self.max_tracking_distance_threshold

        # Find best match across all recent detections
        match_result = find_best_object_match(
            target_obj=self.current_target,
            candidates=all_candidates,
            max_distance=max_distance,
            min_size_similarity=self.min_size_similarity,
        )

        if match_result.is_valid_match:
            self.current_target = match_result.matched_object
            self.target_grasp_pose = None  # Recompute grasp pose
            logger.debug(
                f"Target tracking successful: distance={match_result.distance:.3f}m, "
                f"size_similarity={match_result.size_similarity:.2f}, "
                f"confidence={match_result.confidence:.2f}"
            )
            return True

        logger.debug(
            f"Target tracking lost across {len(self.detection_history)} frames: "
            f"distance={match_result.distance:.3f}m, "
            f"size_similarity={match_result.size_similarity:.2f}, "
            f"thresholds: distance={max_distance:.3f}m, size={self.min_size_similarity:.2f}"
        )
        return False

    def compute_control(
        self,
        ee_pose: Pose,
        grasp_distance: float = 0.15,
        grasp_pitch_degrees: float = 45.0,
    ) -> Tuple[Optional[Vector3], Optional[Vector3], bool, bool, Optional[Pose]]:
        """
        Compute PBVS control with position and orientation servoing.

        Args:
            ee_pose: Current end-effector pose
            grasp_distance: Distance to maintain from target (meters)
            grasp_pitch_degrees: Desired grasp pitch angle (degrees)

        Returns:
            Tuple of (velocity_command, angular_velocity_command, target_reached, has_target, target_pose)
            - velocity_command: Linear velocity vector or None if no target (None in direct_ee_control mode)
            - angular_velocity_command: Angular velocity vector or None if no target (None in direct_ee_control mode)
            - target_reached: True if within target tolerance
            - has_target: True if currently tracking a target
            - target_pose: Target EE pose (only in direct_ee_control mode, otherwise None)
        """
        # Check if we have a target
        if not self.current_target:
            return None, None, False, False, None

        # Update target grasp pose with provided distance and pitch
        self.target_grasp_pose = update_target_grasp_pose(
            self.current_target.bbox.center, ee_pose, grasp_distance, grasp_pitch_degrees
        )

        if self.target_grasp_pose is None:
            logger.warning("Failed to compute grasp pose")
            return None, None, False, False, None

        # Compute errors for visualization before checking if reached (in case pose gets cleared)
        if self.direct_ee_control and self.target_grasp_pose:
            self.last_position_error = Vector3(
                self.target_grasp_pose.position.x - ee_pose.position.x,
                self.target_grasp_pose.position.y - ee_pose.position.y,
                self.target_grasp_pose.position.z - ee_pose.position.z,
            )

        # Check if target reached using our separate function
        target_reached = is_target_reached(self.target_grasp_pose, ee_pose, self.target_tolerance)

        # Return appropriate values based on control mode
        if self.direct_ee_control:
            # Direct control mode
            if self.target_grasp_pose:
                self.last_target_reached = target_reached
                # Return has_target=True since we have a target
                return None, None, target_reached, True, self.target_grasp_pose
            else:
                return None, None, False, True, None
        else:
            # Velocity control mode - use controller
            # NOTE(review): controller_reached is discarded; the reached flag
            # returned is the one from is_target_reached above — confirm intended.
            velocity_cmd, angular_velocity_cmd, controller_reached = (
                self.controller.compute_control(ee_pose, self.target_grasp_pose)
            )
            # Return has_target=True since we have a target, regardless of tracking status
            return velocity_cmd, angular_velocity_cmd, target_reached, True, None

    def create_status_overlay(
        self,
        image: np.ndarray,
        grasp_stage=None,
    ) -> np.ndarray:
        """
        Create PBVS status overlay on image.

        Args:
            image: Input image
            grasp_stage: Current grasp stage (optional)

        Returns:
            Image with PBVS status overlay
        """
        stage_value = grasp_stage.value if grasp_stage else "idle"
        return create_pbvs_visualization(
            image,
            self.current_target,
            self.last_position_error,
            self.last_target_reached,
            stage_value,
        )
+
+
class PBVSController:
    """
    Low-level Position-Based Visual Servoing controller.
    Pure control logic that computes velocity commands from poses.

    Handles:
    - Position and orientation error computation
    - Velocity command generation with gain control
    - Target reached detection
    """

    def __init__(
        self,
        position_gain: float = 0.5,
        rotation_gain: float = 0.3,
        max_velocity: float = 0.1,  # m/s
        max_angular_velocity: float = 0.5,  # rad/s
        target_tolerance: float = 0.01,  # 1cm
    ):
        """
        Initialize PBVS controller.

        Args:
            position_gain: Proportional gain for position control
            rotation_gain: Proportional gain for rotation control
            max_velocity: Maximum linear velocity command magnitude (m/s)
            max_angular_velocity: Maximum angular velocity command magnitude (rad/s)
            target_tolerance: Distance threshold for considering target reached (m)
        """
        self.position_gain = position_gain
        self.rotation_gain = rotation_gain
        self.max_velocity = max_velocity
        self.max_angular_velocity = max_angular_velocity
        self.target_tolerance = target_tolerance

        # Last computed errors/commands, kept for visualization/debugging.
        self.last_position_error = None
        self.last_rotation_error = None
        self.last_velocity_cmd = None
        self.last_angular_velocity_cmd = None
        self.last_target_reached = False

        logger.info(
            f"Initialized PBVS controller: pos_gain={position_gain}, rot_gain={rotation_gain}, "
            f"max_vel={max_velocity}m/s, max_ang_vel={max_angular_velocity}rad/s, "
            f"target_tolerance={target_tolerance}m"
        )

    def clear_state(self):
        """Clear controller state."""
        self.last_position_error = None
        self.last_rotation_error = None
        self.last_velocity_cmd = None
        self.last_angular_velocity_cmd = None
        self.last_target_reached = False

    def compute_control(
        self, ee_pose: Pose, grasp_pose: Pose
    ) -> Tuple[Optional[Vector3], Optional[Vector3], bool]:
        """
        Compute PBVS control with position and orientation servoing.

        Args:
            ee_pose: Current end-effector pose
            grasp_pose: Target grasp pose

        Returns:
            Tuple of (velocity_command, angular_velocity_command, target_reached)
            - velocity_command: Linear velocity vector
            - angular_velocity_command: Angular velocity vector
            - target_reached: True if within target tolerance
        """
        # Calculate position error (target - EE position)
        error = Vector3(
            grasp_pose.position.x - ee_pose.position.x,
            grasp_pose.position.y - ee_pose.position.y,
            grasp_pose.position.z - ee_pose.position.z,
        )
        self.last_position_error = error

        # Compute velocity command with proportional control
        velocity_cmd = Vector3(
            error.x * self.position_gain,
            error.y * self.position_gain,
            error.z * self.position_gain,
        )

        # Limit velocity magnitude (scale uniformly to preserve direction)
        vel_magnitude = np.linalg.norm([velocity_cmd.x, velocity_cmd.y, velocity_cmd.z])
        if vel_magnitude > self.max_velocity:
            scale = self.max_velocity / vel_magnitude
            velocity_cmd = Vector3(
                float(velocity_cmd.x * scale),
                float(velocity_cmd.y * scale),
                float(velocity_cmd.z * scale),
            )

        self.last_velocity_cmd = velocity_cmd

        # Compute angular velocity for orientation control
        angular_velocity_cmd = self._compute_angular_velocity(grasp_pose.orientation, ee_pose)

        # Check if target reached (position error only)
        error_magnitude = np.linalg.norm([error.x, error.y, error.z])
        target_reached = bool(error_magnitude < self.target_tolerance)
        self.last_target_reached = target_reached

        return velocity_cmd, angular_velocity_cmd, target_reached

    def _compute_angular_velocity(self, target_rot: Quaternion, current_pose: Pose) -> Vector3:
        """
        Compute angular velocity commands for orientation control.
        Uses quaternion error computation for better numerical stability.

        Args:
            target_rot: Target orientation (quaternion)
            current_pose: Current EE pose

        Returns:
            Angular velocity command as Vector3
        """
        # Use quaternion error for better numerical stability

        # Convert to scipy Rotation objects (scipy expects [x, y, z, w] order)
        target_rot_scipy = R.from_quat([target_rot.x, target_rot.y, target_rot.z, target_rot.w])
        current_rot_scipy = R.from_quat(
            [
                current_pose.orientation.x,
                current_pose.orientation.y,
                current_pose.orientation.z,
                current_pose.orientation.w,
            ]
        )

        # Compute rotation error: error = target * current^(-1)
        error_rot = target_rot_scipy * current_rot_scipy.inv()

        # Convert to axis-angle representation for control
        error_axis_angle = error_rot.as_rotvec()

        # Use axis-angle directly as angular velocity error (small angle approximation)
        roll_error = error_axis_angle[0]
        pitch_error = error_axis_angle[1]
        yaw_error = error_axis_angle[2]

        self.last_rotation_error = Vector3(roll_error, pitch_error, yaw_error)

        # Apply proportional control
        angular_velocity = Vector3(
            roll_error * self.rotation_gain,
            pitch_error * self.rotation_gain,
            yaw_error * self.rotation_gain,
        )

        # Limit angular velocity magnitude (uniform scaling preserves axis)
        ang_vel_magnitude = np.sqrt(
            angular_velocity.x**2 + angular_velocity.y**2 + angular_velocity.z**2
        )
        if ang_vel_magnitude > self.max_angular_velocity:
            scale = self.max_angular_velocity / ang_vel_magnitude
            angular_velocity = Vector3(
                angular_velocity.x * scale, angular_velocity.y * scale, angular_velocity.z * scale
            )

        self.last_angular_velocity_cmd = angular_velocity

        return angular_velocity

    def create_status_overlay(
        self,
        image: np.ndarray,
        current_target: Optional[Detection3D] = None,
    ) -> np.ndarray:
        """
        Create PBVS status overlay on image.

        Args:
            image: Input image
            current_target: Current target object Detection3D (for display)

        Returns:
            Image with PBVS status overlay
        """
        return create_pbvs_visualization(
            image,
            current_target,
            self.last_position_error,
            self.last_target_reached,
            "velocity_control",
        )
diff --git a/dimos/manipulation/visual_servoing/utils.py b/dimos/manipulation/visual_servoing/utils.py
new file mode 100644
index 0000000000..4546326ef6
--- /dev/null
+++ b/dimos/manipulation/visual_servoing/utils.py
@@ -0,0 +1,828 @@
+# Copyright 2025 Dimensional Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+from typing import Dict, Any, Optional, List, Tuple, Union
+from dataclasses import dataclass
+
+from dimos_lcm.geometry_msgs import Pose, Vector3, Quaternion, Point
+from dimos_lcm.vision_msgs import Detection3D, Detection2D
+import cv2
+from dimos.perception.detection2d.utils import plot_results
+from dimos.perception.common.utils import project_2d_points_to_3d
+from dimos.utils.transform_utils import (
+ optical_to_robot_frame,
+ robot_to_optical_frame,
+ pose_to_matrix,
+ matrix_to_pose,
+ euler_to_quaternion,
+ compose_transforms,
+ yaw_towards_point,
+ get_distance,
+)
+
+
def match_detection_by_id(
    detection_3d: Detection3D, detections_3d: List[Detection3D], detections_2d: List[Detection2D]
) -> Optional[Detection2D]:
    """
    Look up the Detection2D paired with a given Detection3D.

    The two lists are assumed index-aligned (1:1): the match is the 2D entry
    at the same index as the 3D entry whose id equals ``detection_3d.id``.

    Args:
        detection_3d: The Detection3D to match
        detections_3d: List of all Detection3D objects
        detections_2d: List of all Detection2D objects (index-aligned with detections_3d)

    Returns:
        Corresponding Detection2D if found, None otherwise
    """
    target_id = detection_3d.id
    for idx, candidate in enumerate(detections_3d):
        if candidate.id == target_id and idx < len(detections_2d):
            return detections_2d[idx]
    return None
+
+
def transform_pose(
    obj_pos: np.ndarray,
    obj_orientation: np.ndarray,
    transform_matrix: np.ndarray,
    to_optical: bool = False,
    to_robot: bool = False,
) -> Pose:
    """
    Transform object pose with optional frame convention conversion.

    Args:
        obj_pos: Object position [x, y, z]
        obj_orientation: Object orientation [roll, pitch, yaw] in radians
        transform_matrix: 4x4 transformation matrix from camera frame to desired frame
        to_optical: If True, input is in robot frame → convert result to optical frame
        to_robot: If True, input is in optical frame → convert to robot frame first

    Returns:
        Object pose in desired frame as Pose
    """
    # Build the source pose: euler angles -> quaternion, paired with position.
    quat = euler_to_quaternion(
        Vector3(obj_orientation[0], obj_orientation[1], obj_orientation[2])
    )
    source_pose = Pose(Point(obj_pos[0], obj_pos[1], obj_pos[2]), quat)

    # Input-side convention: optical-frame input is converted to the robot
    # convention before the rigid transform is applied.
    if to_robot:
        source_pose = optical_to_robot_frame(source_pose)

    # Chain the camera->object pose onto the camera->desired transform.
    combined = compose_transforms(transform_matrix, pose_to_matrix(source_pose))
    result_pose = matrix_to_pose(combined)

    # Output-side convention: optionally express the result optically.
    return robot_to_optical_frame(result_pose) if to_optical else result_pose
+
+
def transform_points_3d(
    points_3d: np.ndarray,
    transform_matrix: np.ndarray,
    to_optical: bool = False,
    to_robot: bool = False,
) -> np.ndarray:
    """
    Transform 3D points with optional frame convention conversion.

    Each point is run through the same pipeline as transform_pose, using an
    identity orientation.

    Args:
        points_3d: Nx3 array of 3D points [x, y, z]
        transform_matrix: 4x4 transformation matrix from camera frame to desired frame
        to_optical: If True, input is in robot frame → convert result to optical frame
        to_robot: If True, input is in optical frame → convert to robot frame first

    Returns:
        Nx3 float32 array of transformed 3D points in desired frame
    """
    if points_3d.size == 0:
        return np.zeros((0, 3), dtype=np.float32)

    pts = np.asarray(points_3d)
    if pts.ndim == 1:
        # Promote a single point to a 1xN row so the loop below is uniform.
        pts = pts.reshape(1, -1)

    results = []
    for pt in pts:
        # Wrap the point in a pose with identity orientation.
        pose_in = Pose(Point(pt[0], pt[1], pt[2]), Quaternion(0.0, 0.0, 0.0, 1.0))

        # Input-side convention conversion.
        if to_robot:
            pose_in = optical_to_robot_frame(pose_in)

        # Apply the rigid transform via matrix composition.
        pose_out = matrix_to_pose(
            compose_transforms(transform_matrix, pose_to_matrix(pose_in))
        )

        # Output-side convention conversion.
        if to_optical:
            pose_out = robot_to_optical_frame(pose_out)

        results.append([pose_out.position.x, pose_out.position.y, pose_out.position.z])

    return np.array(results, dtype=np.float32)
+
+
def select_points_from_depth(
    depth_image: np.ndarray,
    target_point: Tuple[int, int],
    camera_intrinsics: Union[List[float], np.ndarray],
    radius: int = 5,
) -> np.ndarray:
    """
    Select points around a target point within a bounding box and project them to 3D.

    Args:
        depth_image: Depth image in meters (H, W)
        target_point: (x, y) target point coordinates
        camera_intrinsics: Camera parameters as [fx, fy, cx, cy] list or 3x3 matrix
        radius: Half-width of the bounding box (so bbox size is radius*2 x radius*2)

    Returns:
        Nx3 array of 3D points (X, Y, Z) in camera frame. Invalid readings
        (non-positive or non-finite depth) are dropped, so N may be smaller
        than the window size, possibly 0.
    """
    x_target, y_target = target_point
    height, width = depth_image.shape

    # Clamp the sampling window to the image bounds; a fully off-image target
    # yields an empty range and therefore an empty result.
    x_min = max(0, x_target - radius)
    x_max = min(width, x_target + radius)
    y_min = max(0, y_target - radius)
    y_max = min(height, y_target + radius)

    # Create coordinate grids for the bounding box (vectorized)
    y_coords, x_coords = np.meshgrid(range(y_min, y_max), range(x_min, x_max), indexing="ij")

    # Flatten to get all coordinate pairs
    x_flat = x_coords.flatten()
    y_flat = y_coords.flatten()

    # Extract corresponding depth values using advanced indexing
    depth_flat = depth_image[y_flat, x_flat]

    # Keep only strictly positive, finite depth readings.
    valid_mask = (depth_flat > 0) & np.isfinite(depth_flat)

    if not np.any(valid_mask):
        return np.zeros((0, 3), dtype=np.float32)

    points_2d = np.column_stack([x_flat[valid_mask], y_flat[valid_mask]]).astype(np.float32)
    depth_values = depth_flat[valid_mask].astype(np.float32)

    points_3d = project_2d_points_to_3d(points_2d, depth_values, camera_intrinsics)

    return points_3d
+
+
def update_target_grasp_pose(
    target_pose: Pose, ee_pose: Pose, grasp_distance: float = 0.0, grasp_pitch_degrees: float = 45.0
) -> Optional[Pose]:
    """
    Update target grasp pose based on current target pose and EE pose.

    Args:
        target_pose: Target pose to grasp
        ee_pose: Current end-effector pose
        grasp_distance: Distance to maintain from target (pregrasp or grasp
            distance); 0.0 means grasp at the target position itself
        grasp_pitch_degrees: Grasp pitch angle in degrees — 0° is a level
            approach, 90° is top-down (default 45°)

    Returns:
        Target grasp pose yawed toward the end effector. (The Optional return
        type is kept for interface compatibility; this implementation always
        returns a Pose.)
    """

    target_pos = target_pose.position

    # Calculate orientation pointing from target towards EE
    yaw_to_ee = yaw_towards_point(target_pos, ee_pose.position)

    # Create target pose with proper orientation
    # Convert grasp pitch from degrees to radians with mapping:
    # 0° (level) -> π/2 (1.57 rad), 90° (top-down) -> π (3.14 rad)
    pitch_radians = 1.57 + np.radians(grasp_pitch_degrees)

    # Convert euler angles to quaternion using utility function
    euler = Vector3(0.0, pitch_radians, yaw_to_ee)  # roll=0, pitch=mapped, yaw=calculated
    target_orientation = euler_to_quaternion(euler)

    updated_pose = Pose(target_pos, target_orientation)

    # Stand off along the approach axis when a non-zero distance is requested.
    if grasp_distance > 0.0:
        return apply_grasp_distance(updated_pose, grasp_distance)
    else:
        return updated_pose
+
+
def apply_grasp_distance(target_pose: Pose, distance: float) -> Pose:
    """
    Apply grasp distance offset to target pose along its approach direction.

    The gripper is assumed to approach along its local -z axis; change the
    local vector below if your gripper approaches along a different axis.

    Args:
        target_pose: Target grasp pose
        distance: Distance to offset along the approach direction (meters)

    Returns:
        Target pose offset by the specified distance along its approach
        direction, with the original orientation preserved
    """
    # Pull the rotation out of the pose's homogeneous transform.
    rotation = pose_to_matrix(target_pose)[:3, :3]

    # Express the local -z approach axis in world coordinates.
    approach_world = rotation @ np.array([0, 0, -1])

    # Slide the position along the world-frame approach direction.
    shifted = Point(
        target_pose.position.x + distance * approach_world[0],
        target_pose.position.y + distance * approach_world[1],
        target_pose.position.z + distance * approach_world[2],
    )

    return Pose(shifted, target_pose.orientation)
+
+
def is_target_reached(target_pose: Pose, current_pose: Pose, tolerance: float = 0.01) -> bool:
    """
    Check if the target pose has been reached within tolerance.

    Only position is compared (via get_distance); orientation is ignored.

    Args:
        target_pose: Target pose to reach
        current_pose: Current pose (e.g., end-effector pose)
        tolerance: Distance threshold for considering target reached (meters, default 0.01 = 1cm)

    Returns:
        True if target is reached within tolerance, False otherwise
    """
    return get_distance(target_pose, current_pose) < tolerance
+
+
@dataclass
class ObjectMatchResult:
    """Result of object matching with confidence metrics."""

    # Best-matching detection, or None when no candidate passed the validity gates.
    matched_object: Optional[Detection3D]
    # Weighted similarity score of the accepted match (0.0 when no match).
    confidence: float
    # Euclidean center distance to the match in meters (inf when no match).
    distance: float
    # Per-dimension size similarity in [0, 1] (0.0 when no match).
    size_similarity: float
    # True iff matched_object is not None.
    is_valid_match: bool
+
+
def calculate_object_similarity(
    target_obj: Detection3D,
    candidate_obj: Detection3D,
    distance_weight: float = 0.6,
    size_weight: float = 0.4,
) -> Tuple[float, float, float]:
    """
    Calculate comprehensive similarity between two objects.

    Args:
        target_obj: Target Detection3D object
        candidate_obj: Candidate Detection3D object
        distance_weight: Weight for distance component (0-1)
        size_weight: Weight for size component (0-1)

    Returns:
        Tuple of (total_similarity, distance_m, size_similarity).
        NOTE(review): total_similarity is only bounded by [0, 1] when the two
        weights sum to 1 — confirm callers keep that invariant.
    """
    # Extract positions
    target_pos = target_obj.bbox.center.position
    candidate_pos = candidate_obj.bbox.center.position

    target_xyz = np.array([target_pos.x, target_pos.y, target_pos.z])
    candidate_xyz = np.array([candidate_pos.x, candidate_pos.y, candidate_pos.z])

    # Calculate Euclidean distance
    distance = np.linalg.norm(target_xyz - candidate_xyz)
    # Inverse falloff: 1 at d=0, tending to 0 as d grows (not exponential).
    distance_similarity = 1.0 / (1.0 + distance)

    # Calculate size similarity by comparing each dimension individually
    size_similarity = 1.0  # Default if no size info
    target_size = target_obj.bbox.size
    candidate_size = candidate_obj.bbox.size

    if target_size and candidate_size:
        # Extract dimensions
        target_dims = [target_size.x, target_size.y, target_size.z]
        candidate_dims = [candidate_size.x, candidate_size.y, candidate_size.z]

        # Calculate similarity for each dimension pair
        dim_similarities = []
        for target_dim, candidate_dim in zip(target_dims, candidate_dims):
            if target_dim == 0.0 and candidate_dim == 0.0:
                dim_similarities.append(1.0)  # Both dimensions are zero
            elif target_dim == 0.0 or candidate_dim == 0.0:
                dim_similarities.append(0.0)  # One dimension is zero, other is not
            else:
                # Calculate similarity as min/max ratio
                max_dim = max(target_dim, candidate_dim)
                min_dim = min(target_dim, candidate_dim)
                dim_similarity = min_dim / max_dim if max_dim > 0 else 0.0
                dim_similarities.append(dim_similarity)

        # Return average similarity across all dimensions
        size_similarity = np.mean(dim_similarities) if dim_similarities else 0.0

    # Weighted combination
    total_similarity = distance_weight * distance_similarity + size_weight * size_similarity

    return total_similarity, distance, size_similarity
+
+
def find_best_object_match(
    target_obj: Detection3D,
    candidates: List[Detection3D],
    max_distance: float = 0.1,
    min_size_similarity: float = 0.4,
    distance_weight: float = 0.7,
    size_weight: float = 0.3,
) -> ObjectMatchResult:
    """
    Find the best matching object from candidates using distance and size criteria.

    A candidate is eligible only when its center distance is within
    max_distance and its size similarity is at least min_size_similarity;
    among eligible candidates, the one with the highest weighted similarity
    wins.

    Args:
        target_obj: Target Detection3D to match against
        candidates: List of candidate Detection3D objects
        max_distance: Maximum allowed distance for valid match (meters)
        min_size_similarity: Minimum size similarity for valid match (0-1)
        distance_weight: Weight for distance in similarity calculation
        size_weight: Weight for size in similarity calculation

    Returns:
        ObjectMatchResult with best match and confidence metrics
    """
    if not candidates or not target_obj.bbox or not target_obj.bbox.center:
        return ObjectMatchResult(None, 0.0, float("inf"), 0.0, False)

    best_match = None
    best_score = 0.0
    best_dist = float("inf")
    best_size = 0.0

    for cand in candidates:
        # Skip malformed candidates with no usable center.
        if not cand.bbox or not cand.bbox.center:
            continue

        score, dist, size_sim = calculate_object_similarity(
            target_obj, cand, distance_weight, size_weight
        )

        # Guard clauses for the validity constraints.
        if dist > max_distance or size_sim < min_size_similarity:
            continue

        if score > best_score:
            best_match, best_score, best_dist, best_size = cand, score, dist, size_sim

    return ObjectMatchResult(
        matched_object=best_match,
        confidence=best_score,
        distance=best_dist,
        size_similarity=best_size,
        is_valid_match=best_match is not None,
    )
+
+
def parse_zed_pose(zed_pose_data: Dict[str, Any]) -> Optional[Pose]:
    """
    Parse ZED pose data dictionary into a Pose object.

    Args:
        zed_pose_data: Dictionary from ZEDCamera.get_pose() containing:
            - position: [x, y, z] in meters
            - rotation: [x, y, z, w] quaternion
            - valid: Whether pose is valid
            Extra keys (e.g. euler_angles) are ignored.

    Returns:
        Pose object with position and orientation, or None if the dict is
        empty or flagged invalid
    """
    if not zed_pose_data or not zed_pose_data.get("valid", False):
        return None

    # Default to origin / identity quaternion when a field is absent, so a
    # partially filled dict cannot raise KeyError. (Previously "position" had
    # a fallback but "rotation" used direct indexing — inconsistent.)
    position = zed_pose_data.get("position", [0, 0, 0])
    pos_vector = Point(position[0], position[1], position[2])

    quat = zed_pose_data.get("rotation", [0.0, 0.0, 0.0, 1.0])
    orientation = Quaternion(quat[0], quat[1], quat[2], quat[3])
    return Pose(pos_vector, orientation)
+
+
def estimate_object_depth(
    depth_image: np.ndarray, segmentation_mask: Optional[np.ndarray], bbox: List[float]
) -> float:
    """
    Estimate object depth dimension using segmentation mask and depth data.
    Optimized for real-time performance.

    Fix over the previous version: NaN/inf/non-positive depth readings (common
    in real depth maps) are filtered out before the percentile computation,
    which previously could yield a NaN estimate that np.clip passed through.

    Args:
        depth_image: Depth image in meters
        segmentation_mask: Binary segmentation mask for the object
        bbox: Bounding box [x1, y1, x2, y2]

    Returns:
        Estimated object depth in meters, clamped to [0.02, 0.5] when derived
        from depth data, otherwise a bbox-area heuristic (0.05/0.10/0.15)
    """
    x1, y1, x2, y2 = int(bbox[0]), int(bbox[1]), int(bbox[2]), int(bbox[3])

    # Extract depth ROI once
    roi_depth = depth_image[y1:y2, x1:x2]

    if segmentation_mask is not None and segmentation_mask.size > 0:
        # Extract mask ROI efficiently (crop only when shapes differ).
        mask_roi = (
            segmentation_mask[y1:y2, x1:x2]
            if segmentation_mask.shape != roi_depth.shape
            else segmentation_mask
        )

        # Fast mask application using boolean indexing
        valid_mask = mask_roi > 0
        if np.sum(valid_mask) > 10:  # Early exit if not enough points
            masked_depths = roi_depth[valid_mask]

            # Drop invalid readings so they cannot poison the percentiles.
            masked_depths = masked_depths[
                np.isfinite(masked_depths) & (masked_depths > 0)
            ]

            if masked_depths.size > 10:
                # Robust thickness estimate: 10th-90th percentile spread.
                depth_90 = np.percentile(masked_depths, 90)
                depth_10 = np.percentile(masked_depths, 10)
                depth_range = depth_90 - depth_10

                # Clamp to reasonable physical bounds.
                return float(np.clip(depth_range, 0.02, 0.5))

    # Fast fallback using bbox area when no mask / not enough valid depth.
    bbox_area = (x2 - x1) * (y2 - y1)

    # Coarse area-based estimation: bigger on-screen objects assumed thicker.
    if bbox_area > 10000:
        return 0.15
    elif bbox_area > 5000:
        return 0.10
    else:
        return 0.05
+
+
+# ============= Visualization Functions =============
+
+
def create_manipulation_visualization(
    rgb_image: np.ndarray,
    feedback,
    detection_3d_array=None,
    detection_2d_array=None,
) -> np.ndarray:
    """
    Create simple visualization for manipulation class using feedback.

    Args:
        rgb_image: RGB image array
        feedback: Feedback object containing all state information.
            NOTE(review): usage below assumes it exposes `grasp_stage` (an
            enum whose `.value` is a string), `target_tracked`,
            `waiting_for_reach`, and `success` (bool or None while running) —
            confirm against the manipulation feedback type.
        detection_3d_array: Optional 3D detections for object visualization
        detection_2d_array: Optional 2D detections for object visualization
            (assumed index-aligned with detection_3d_array)

    Returns:
        BGR image with visualization overlays
    """
    # Convert to BGR for OpenCV drawing routines.
    viz = cv2.cvtColor(rgb_image, cv2.COLOR_RGB2BGR)

    # Draw detections only when both arrays are available.
    if detection_3d_array and detection_2d_array:
        # Extract 2D bboxes: convert center/size format to [x1, y1, x2, y2].
        bboxes_2d = []
        for det_2d in detection_2d_array.detections:
            if det_2d.bbox:
                x1 = det_2d.bbox.center.position.x - det_2d.bbox.size_x / 2
                y1 = det_2d.bbox.center.position.y - det_2d.bbox.size_y / 2
                x2 = det_2d.bbox.center.position.x + det_2d.bbox.size_x / 2
                y2 = det_2d.bbox.center.position.y + det_2d.bbox.size_y / 2
                bboxes_2d.append([x1, y1, x2, y2])

        # Draw detections on the RGB image, then redo the BGR conversion,
        # replacing the plain conversion above.
        rgb_with_detections = visualize_detections_3d(
            rgb_image, detection_3d_array.detections, show_coordinates=True, bboxes_2d=bboxes_2d
        )
        viz = cv2.cvtColor(rgb_with_detections, cv2.COLOR_RGB2BGR)

    # Add manipulation status overlay (title line).
    status_y = 30
    cv2.putText(
        viz,
        "Eye-in-Hand Visual Servoing",
        (10, status_y),
        cv2.FONT_HERSHEY_SIMPLEX,
        0.6,
        (0, 255, 255),
        2,
    )

    # Stage information, color-coded per stage (BGR tuples); unknown stage
    # strings fall back to white.
    stage_text = f"Stage: {feedback.grasp_stage.value.upper()}"
    stage_color = {
        "idle": (100, 100, 100),
        "pre_grasp": (0, 255, 255),
        "grasp": (0, 255, 0),
        "close_and_retract": (255, 0, 255),
        "place": (0, 150, 255),
        "retract": (255, 150, 0),
    }.get(feedback.grasp_stage.value, (255, 255, 255))

    cv2.putText(
        viz,
        stage_text,
        (10, status_y + 25),
        cv2.FONT_HERSHEY_SIMPLEX,
        0.5,
        stage_color,
        1,
    )

    # Target tracking status: green TRACKED, red LOST (LOST only shown when
    # a task is active, i.e. stage is not idle).
    if feedback.target_tracked:
        cv2.putText(
            viz,
            "Target: TRACKED",
            (10, status_y + 45),
            cv2.FONT_HERSHEY_SIMPLEX,
            0.5,
            (0, 255, 0),
            1,
        )
    elif feedback.grasp_stage.value != "idle":
        cv2.putText(
            viz,
            "Target: LOST",
            (10, status_y + 45),
            cv2.FONT_HERSHEY_SIMPLEX,
            0.5,
            (0, 0, 255),
            1,
        )

    # Waiting status while the robot travels to the commanded pose.
    if feedback.waiting_for_reach:
        cv2.putText(
            viz,
            "Status: WAITING FOR ROBOT",
            (10, status_y + 65),
            cv2.FONT_HERSHEY_SIMPLEX,
            0.5,
            (255, 255, 0),
            1,
        )

    # Overall result banner once the outcome is known (success is None while
    # the task is still in progress).
    if feedback.success is not None:
        result_text = "Pick & Place: SUCCESS" if feedback.success else "Pick & Place: FAILED"
        result_color = (0, 255, 0) if feedback.success else (0, 0, 255)
        cv2.putText(
            viz,
            result_text,
            (10, status_y + 85),
            cv2.FONT_HERSHEY_SIMPLEX,
            0.5,
            result_color,
            2,
        )

    # Control hints (bottom of image)
    hint_text = "Click object to grasp | s=STOP | r=RESET | g=RELEASE"
    cv2.putText(
        viz,
        hint_text,
        (10, viz.shape[0] - 10),
        cv2.FONT_HERSHEY_SIMPLEX,
        0.4,
        (200, 200, 200),
        1,
    )

    return viz
+
+
def create_pbvs_visualization(
    image: np.ndarray,
    current_target=None,
    position_error=None,
    target_reached=False,
    grasp_stage="idle",
) -> np.ndarray:
    """
    Create simple PBVS visualization overlay.

    Args:
        image: Input image (RGB or BGR)
        current_target: Current target Detection3D
        position_error: Position error Vector3
        target_reached: Whether target is reached
        grasp_stage: Current grasp stage string

    Returns:
        Image with PBVS overlay (the input is not modified in place)
    """
    viz = image.copy()

    # Without a target there is nothing to report.
    if current_target is None:
        return viz

    # Bottom status panel geometry.
    height, width = viz.shape[:2]
    panel_height = 100
    panel_y = height - panel_height

    # Darken the panel strip with a 30% black overlay.
    overlay = viz.copy()
    cv2.rectangle(overlay, (0, panel_y), (width, height), (0, 0, 0), -1)
    viz = cv2.addWeighted(viz, 0.7, overlay, 0.3, 0)

    def _text(label, org, scale, color, thickness):
        # Shared wrapper so every label uses the same font family.
        cv2.putText(viz, label, org, cv2.FONT_HERSHEY_SIMPLEX, scale, color, thickness)

    # Panel title.
    y_offset = panel_y + 20
    _text("PBVS Control", (10, y_offset), 0.6, (0, 255, 255), 2)

    # Position error magnitude in cm; green once the target is reached.
    if position_error:
        error_mag = np.linalg.norm([position_error.x, position_error.y, position_error.z])
        error_color = (0, 255, 0) if target_reached else (0, 255, 255)
        _text(f"Error: {error_mag * 100:.1f}cm", (10, y_offset + 25), 0.5, error_color, 1)

    # Current grasp stage.
    _text(f"Stage: {grasp_stage}", (10, y_offset + 45), 0.5, (255, 150, 255), 1)

    # Reached indicator on the right-hand side of the panel.
    if target_reached:
        _text("TARGET REACHED", (width - 150, y_offset + 25), 0.6, (0, 255, 0), 2)

    return viz
+
+
def visualize_detections_3d(
    rgb_image: np.ndarray,
    detections: List[Detection3D],
    show_coordinates: bool = True,
    bboxes_2d: Optional[List[List[float]]] = None,
) -> np.ndarray:
    """
    Visualize detections with 3D position overlay next to bounding boxes.

    Args:
        rgb_image: Original RGB image
        detections: List of Detection3D objects
        show_coordinates: Whether to show 3D coordinates next to bounding boxes
        bboxes_2d: Optional list of [x1, y1, x2, y2] boxes index-aligned with
            `detections`; without it the function returns an untouched copy

    Returns:
        Visualization image (a copy; the input is not modified)
    """
    # Nothing to draw without detections.
    if not detections:
        return rgb_image.copy()

    # If no 2D bboxes provided, skip visualization — 3D-only detections
    # cannot be placed on the image plane.
    if bboxes_2d is None:
        return rgb_image.copy()

    # Extract data for plot_results function.
    # NOTE(review): det.id is assumed to be a string; non-numeric ids fall
    # back to the list index as the track id — confirm against the LCM type.
    bboxes = bboxes_2d
    track_ids = [int(det.id) if det.id.isdigit() else i for i, det in enumerate(detections)]
    class_ids = [i for i in range(len(detections))]
    confidences = [
        det.results[0].hypothesis.score if det.results_length > 0 else 0.0 for det in detections
    ]
    names = [
        det.results[0].hypothesis.class_id if det.results_length > 0 else "unknown"
        for det in detections
    ]

    # Use plot_results for basic box/label rendering.
    viz = plot_results(rgb_image, bboxes, track_ids, class_ids, confidences, names)

    # Add 3D position coordinates if requested
    # (the bboxes_2d check is redundant after the early return above, kept as-is).
    if show_coordinates and bboxes_2d is not None:
        for i, det in enumerate(detections):
            if det.bbox and det.bbox.center and i < len(bboxes_2d):
                position = det.bbox.center.position
                bbox = bboxes_2d[i]

                pos_xyz = np.array([position.x, position.y, position.z])

                # Get bounding box coordinates
                x1, y1, x2, y2 = map(int, bbox)

                # Add position text next to bounding box (top-right corner)
                pos_text = f"({pos_xyz[0]:.2f}, {pos_xyz[1]:.2f}, {pos_xyz[2]:.2f})"
                text_x = x2 + 5  # Right edge of bbox + small offset
                text_y = y1 + 15  # Top edge of bbox + small offset

                # Add black background rectangle for better readability
                text_size = cv2.getTextSize(pos_text, cv2.FONT_HERSHEY_SIMPLEX, 0.4, 1)[0]
                cv2.rectangle(
                    viz,
                    (text_x - 2, text_y - text_size[1] - 2),
                    (text_x + text_size[0] + 2, text_y + 2),
                    (0, 0, 0),
                    -1,
                )

                cv2.putText(
                    viz,
                    pos_text,
                    (text_x, text_y),
                    cv2.FONT_HERSHEY_SIMPLEX,
                    0.4,
                    (255, 255, 255),
                    1,
                )

    return viz
diff --git a/dimos/msgs/geometry_msgs/Quaternion.py b/dimos/msgs/geometry_msgs/Quaternion.py
index 9879e1e263..a7bb5543c1 100644
--- a/dimos/msgs/geometry_msgs/Quaternion.py
+++ b/dimos/msgs/geometry_msgs/Quaternion.py
@@ -22,6 +22,7 @@
import numpy as np
from dimos_lcm.geometry_msgs import Quaternion as LCMQuaternion
from plum import dispatch
+from scipy.spatial.transform import Rotation as R
from dimos.msgs.geometry_msgs.Vector3 import Vector3
@@ -117,27 +118,12 @@ def to_euler(self) -> Vector3:
Returns:
Vector3: Euler angles as (roll, pitch, yaw) in radians
"""
- # Convert quaternion to Euler angles using ZYX convention (yaw, pitch, roll)
- # Source: https://en.wikipedia.org/wiki/Conversion_between_quaternions_and_Euler_angles
-
- # Roll (x-axis rotation)
- sinr_cosp = 2 * (self.w * self.x + self.y * self.z)
- cosr_cosp = 1 - 2 * (self.x * self.x + self.y * self.y)
- roll = np.arctan2(sinr_cosp, cosr_cosp)
-
- # Pitch (y-axis rotation)
- sinp = 2 * (self.w * self.y - self.z * self.x)
- if abs(sinp) >= 1:
- pitch = np.copysign(np.pi / 2, sinp) # Use 90 degrees if out of range
- else:
- pitch = np.arcsin(sinp)
-
- # Yaw (z-axis rotation)
- siny_cosp = 2 * (self.w * self.z + self.x * self.y)
- cosy_cosp = 1 - 2 * (self.y * self.y + self.z * self.z)
- yaw = np.arctan2(siny_cosp, cosy_cosp)
+ # Use scipy for accurate quaternion to euler conversion
+ quat = [self.x, self.y, self.z, self.w]
+ rotation = R.from_quat(quat)
+ euler_angles = rotation.as_euler("xyz") # roll, pitch, yaw
- return Vector3(roll, pitch, yaw)
+ return Vector3(euler_angles[0], euler_angles[1], euler_angles[2])
def __getitem__(self, idx: int) -> float:
"""Allow indexing into quaternion components: 0=x, 1=y, 2=z, 3=w."""
diff --git a/dimos/perception/common/utils.py b/dimos/perception/common/utils.py
index da9cc58fe0..10d05d9b4d 100644
--- a/dimos/perception/common/utils.py
+++ b/dimos/perception/common/utils.py
@@ -14,15 +14,113 @@
import cv2
import numpy as np
-from typing import List, Tuple, Optional
+from typing import List, Tuple, Optional, Any, Union
from dimos.types.manipulation import ObjectData
from dimos.types.vector import Vector
from dimos.utils.logging_config import setup_logger
+from dimos_lcm.vision_msgs import Detection3D, Detection2D, BoundingBox2D
import torch
logger = setup_logger("dimos.perception.common.utils")
def project_3d_points_to_2d(
    points_3d: np.ndarray, camera_intrinsics: Union[List[float], np.ndarray]
) -> np.ndarray:
    """
    Project 3D points to 2D image coordinates using camera intrinsics.

    Points with non-positive depth are dropped, so the output may contain
    fewer rows than the input (and rows do not track input indices).

    Args:
        points_3d: Nx3 array of 3D points (X, Y, Z)
        camera_intrinsics: Camera parameters as [fx, fy, cx, cy] list or 3x3 matrix

    Returns:
        Mx2 int32 array of 2D image coordinates (u, v), M <= N
    """
    if len(points_3d) == 0:
        return np.zeros((0, 2), dtype=np.int32)

    # Keep only points strictly in front of the camera.
    in_front = points_3d[:, 2] > 0
    if not np.any(in_front):
        return np.zeros((0, 2), dtype=np.int32)
    pts = points_3d[in_front]

    # Accept either a flat [fx, fy, cx, cy] list or a full 3x3 matrix.
    if isinstance(camera_intrinsics, list) and len(camera_intrinsics) == 4:
        fx, fy, cx, cy = camera_intrinsics
    else:
        K = np.array(camera_intrinsics)
        fx, fy, cx, cy = K[0, 0], K[1, 1], K[0, 2], K[1, 2]

    # Pinhole projection. Note the int32 cast truncates toward zero rather
    # than rounding to the nearest pixel.
    u = pts[:, 0] * fx / pts[:, 2] + cx
    v = pts[:, 1] * fy / pts[:, 2] + cy
    return np.column_stack([u, v]).astype(np.int32)
+
+
def project_2d_points_to_3d(
    points_2d: np.ndarray,
    depth_values: np.ndarray,
    camera_intrinsics: Union[List[float], np.ndarray],
) -> np.ndarray:
    """
    Project 2D image points to 3D coordinates using depth values and camera intrinsics.

    Pixels with non-positive depth are dropped, so the output may contain
    fewer rows than the input.

    Args:
        points_2d: Nx2 array of 2D image coordinates (u, v)
        depth_values: N-length array of depth values (Z coordinates) for each point
        camera_intrinsics: Camera parameters as [fx, fy, cx, cy] list or 3x3 matrix

    Returns:
        Mx3 float32 array of 3D points (X, Y, Z), M <= N
    """
    if len(points_2d) == 0:
        return np.zeros((0, 3), dtype=np.float32)

    depths = np.asarray(depth_values)

    # Discard pixels with no usable depth reading.
    keep = depths > 0
    if not np.any(keep):
        return np.zeros((0, 3), dtype=np.float32)

    uv = points_2d[keep]
    z = depths[keep]

    # Accept either a flat [fx, fy, cx, cy] list or a full 3x3 matrix.
    if isinstance(camera_intrinsics, list) and len(camera_intrinsics) == 4:
        fx, fy, cx, cy = camera_intrinsics
    else:
        K = np.array(camera_intrinsics)
        fx, fy, cx, cy = K[0, 0], K[1, 1], K[0, 2], K[1, 2]

    # Inverse pinhole model: X = (u - cx) * Z / fx, Y = (v - cy) * Z / fy.
    x = (uv[:, 0] - cx) * z / fx
    y = (uv[:, 1] - cy) * z / fy
    return np.column_stack([x, y, z]).astype(np.float32)
+
+
def colorize_depth(depth_img: np.ndarray, max_depth: float = 5.0) -> Optional[np.ndarray]:
"""
Normalize and colorize depth image using COLORMAP_JET.
@@ -329,3 +427,68 @@ def combine_object_data(
combined.append(obj_copy)
return combined
+
+
def point_in_bbox(point: Tuple[int, int], bbox: List[float]) -> bool:
    """
    Check if a point is inside a bounding box (edges inclusive).

    Args:
        point: (x, y) coordinates
        bbox: Bounding box [x1, y1, x2, y2]

    Returns:
        True if point is inside bbox
    """
    px, py = point
    x1, y1, x2, y2 = bbox
    return (x1 <= px <= x2) and (y1 <= py <= y2)
+
+
def bbox2d_to_corners(bbox_2d: BoundingBox2D) -> Tuple[float, float, float, float]:
    """
    Convert BoundingBox2D from center format to corner format.

    Args:
        bbox_2d: BoundingBox2D with center and size

    Returns:
        Tuple of (x1, y1, x2, y2) corner coordinates
    """
    cx = bbox_2d.center.position.x
    cy = bbox_2d.center.position.y
    half_w = bbox_2d.size_x / 2.0
    half_h = bbox_2d.size_y / 2.0
    # Top-left then bottom-right.
    return cx - half_w, cy - half_h, cx + half_w, cy + half_h
+
+
def find_clicked_detection(
    click_pos: Tuple[int, int], detections_2d: List[Detection2D], detections_3d: List[Detection3D]
) -> Optional[Detection3D]:
    """
    Find which detection was clicked based on 2D bounding boxes.

    Args:
        click_pos: (x, y) click position
        detections_2d: List of Detection2D objects
        detections_3d: List of Detection3D objects (must be 1:1 correspondence)

    Returns:
        Corresponding Detection3D object if found (first hit wins), None otherwise
    """
    for i, det_2d in enumerate(detections_2d):
        if det_2d.bbox and i < len(detections_3d):
            # Reuse the shared helpers instead of re-deriving the corner math
            # and containment test inline (consistency with point_in_bbox).
            corners = bbox2d_to_corners(det_2d.bbox)
            if point_in_bbox(click_pos, list(corners)):
                return detections_3d[i]

    return None
diff --git a/dimos/perception/detection2d/detic_2d_det.py b/dimos/perception/detection2d/detic_2d_det.py
index fc81526ad2..8bc4f9c4b0 100644
--- a/dimos/perception/detection2d/detic_2d_det.py
+++ b/dimos/perception/detection2d/detic_2d_det.py
@@ -15,6 +15,7 @@
import numpy as np
import os
import sys
+from dimos.perception.detection2d.utils import plot_results
# Add Detic to Python path
detic_path = os.path.join(os.path.dirname(__file__), "..", "..", "models", "Detic")
@@ -404,7 +405,6 @@ def visualize_results(self, image, bboxes, track_ids, class_ids, confidences, na
Returns:
Image with visualized detections
"""
- from dimos.perception.detection2d.utils import plot_results
return plot_results(image, bboxes, track_ids, class_ids, confidences, names)
diff --git a/dimos/perception/grasp_generation/utils.py b/dimos/perception/grasp_generation/utils.py
index 94377363f2..ab0cfd0d15 100644
--- a/dimos/perception/grasp_generation/utils.py
+++ b/dimos/perception/grasp_generation/utils.py
@@ -18,49 +18,7 @@
import open3d as o3d
import cv2
from typing import List, Dict, Tuple, Optional, Union
-
-
-def project_3d_points_to_2d(
- points_3d: np.ndarray, camera_intrinsics: Union[List[float], np.ndarray]
-) -> np.ndarray:
- """
- Project 3D points to 2D image coordinates using camera intrinsics.
-
- Args:
- points_3d: Nx3 array of 3D points (X, Y, Z)
- camera_intrinsics: Camera parameters as [fx, fy, cx, cy] list or 3x3 matrix
-
- Returns:
- Nx2 array of 2D image coordinates (u, v)
- """
- if len(points_3d) == 0:
- return np.zeros((0, 2), dtype=np.int32)
-
- # Filter out points with zero or negative depth
- valid_mask = points_3d[:, 2] > 0
- if not np.any(valid_mask):
- return np.zeros((0, 2), dtype=np.int32)
-
- valid_points = points_3d[valid_mask]
-
- # Extract camera parameters
- if isinstance(camera_intrinsics, list) and len(camera_intrinsics) == 4:
- fx, fy, cx, cy = camera_intrinsics
- else:
- camera_matrix = np.array(camera_intrinsics)
- fx = camera_matrix[0, 0]
- fy = camera_matrix[1, 1]
- cx = camera_matrix[0, 2]
- cy = camera_matrix[1, 2]
-
- # Project to image coordinates
- u = (valid_points[:, 0] * fx / valid_points[:, 2]) + cx
- v = (valid_points[:, 1] * fy / valid_points[:, 2]) + cy
-
- # Round to integer pixel coordinates
- points_2d = np.column_stack([u, v]).astype(np.int32)
-
- return points_2d
+from dimos.perception.common.utils import project_3d_points_to_2d, project_2d_points_to_3d
def create_gripper_geometry(
diff --git a/dimos/perception/pointcloud/utils.py b/dimos/perception/pointcloud/utils.py
index 0813c2ca0e..b3c395bfa3 100644
--- a/dimos/perception/pointcloud/utils.py
+++ b/dimos/perception/pointcloud/utils.py
@@ -24,8 +24,9 @@
import os
import cv2
import open3d as o3d
-from typing import List, Optional, Tuple, Union, Dict
+from typing import List, Optional, Tuple, Union, Dict, Any
from scipy.spatial import cKDTree
+from dimos.perception.common.utils import project_3d_points_to_2d
def load_camera_matrix_from_yaml(
@@ -304,48 +305,6 @@ def filter_point_cloud_radius(
return pcd.remove_radius_outlier(nb_points=nb_points, radius=radius)
-def project_3d_points_to_2d(
- points_3d: np.ndarray, camera_intrinsics: Union[List[float], np.ndarray]
-) -> np.ndarray:
- """
- Project 3D points to 2D image coordinates using camera intrinsics.
-
- Args:
- points_3d: Nx3 array of 3D points (X, Y, Z)
- camera_intrinsics: Camera parameters as [fx, fy, cx, cy] list or 3x3 matrix
-
- Returns:
- Nx2 array of 2D image coordinates (u, v)
- """
- if len(points_3d) == 0:
- return np.zeros((0, 2), dtype=np.int32)
-
- # Filter out points with zero or negative depth
- valid_mask = points_3d[:, 2] > 0
- if not np.any(valid_mask):
- return np.zeros((0, 2), dtype=np.int32)
-
- valid_points = points_3d[valid_mask]
-
- # Extract camera parameters
- if isinstance(camera_intrinsics, list) and len(camera_intrinsics) == 4:
- fx, fy, cx, cy = camera_intrinsics
- else:
- fx = camera_intrinsics[0, 0]
- fy = camera_intrinsics[1, 1]
- cx = camera_intrinsics[0, 2]
- cy = camera_intrinsics[1, 2]
-
- # Project to image coordinates
- u = (valid_points[:, 0] * fx / valid_points[:, 2]) + cx
- v = (valid_points[:, 1] * fy / valid_points[:, 2]) + cy
-
- # Round to integer pixel coordinates
- points_2d = np.column_stack([u, v]).astype(np.int32)
-
- return points_2d
-
-
def overlay_point_clouds_on_image(
base_image: np.ndarray,
point_clouds: List[o3d.geometry.PointCloud],
@@ -1080,3 +1039,73 @@ def combine_object_pointclouds(
combined_pcd.colors = o3d.utility.Vector3dVector(np.vstack(all_colors))
return combined_pcd
+
+
+def extract_centroids_from_masks(
+ rgb_image: np.ndarray,
+ depth_image: np.ndarray,
+ masks: List[np.ndarray],
+ camera_intrinsics: Union[List[float], np.ndarray],
+) -> List[Dict[str, Any]]:
+ """
+ Extract 3D centroids and orientations from segmentation masks.
+
+ Args:
+ rgb_image: RGB image (H, W, 3)
+ depth_image: Depth image (H, W) in meters
+ masks: List of boolean masks (H, W)
+ camera_intrinsics: Camera parameters as [fx, fy, cx, cy] or 3x3 matrix
+
+ Returns:
+ List of dictionaries containing:
+ - centroid: 3D centroid position [x, y, z] in camera frame
+ - orientation: Normalized direction vector from camera to centroid
+ - num_points: Number of valid 3D points
+ - mask_idx: Index of the mask in the input list
+ """
+ # Extract camera parameters
+ if isinstance(camera_intrinsics, list) and len(camera_intrinsics) == 4:
+ fx, fy, cx, cy = camera_intrinsics
+ else:
+ fx = camera_intrinsics[0, 0]
+ fy = camera_intrinsics[1, 1]
+ cx = camera_intrinsics[0, 2]
+ cy = camera_intrinsics[1, 2]
+
+ results = []
+
+ for mask_idx, mask in enumerate(masks):
+ if mask is None or mask.sum() == 0:
+ continue
+
+        # Get pixel coordinates where mask is True
+        y_coords, x_coords = np.where(mask)
+        depths = depth_image[y_coords, x_coords]
+        # Keep only pixels with a valid (positive) depth reading, matching
+        # the depth filtering done by project_3d_points_to_2d
+        valid = depths > 0
+        if not np.any(valid):
+            continue
+        x_coords, y_coords, depths = x_coords[valid], y_coords[valid], depths[valid]
+        # Convert to 3D points in camera frame
+        X = (x_coords - cx) * depths / fx
+        Y = (y_coords - cy) * depths / fy
+ Z = depths
+
+ # Calculate centroid
+ centroid_x = np.mean(X)
+ centroid_y = np.mean(Y)
+ centroid_z = np.mean(Z)
+ centroid = np.array([centroid_x, centroid_y, centroid_z])
+
+ # Calculate orientation as normalized direction from camera origin to centroid
+ # Camera origin is at (0, 0, 0)
+ orientation = centroid / np.linalg.norm(centroid)
+
+ results.append(
+ {
+ "centroid": centroid,
+ "orientation": orientation,
+                "num_points": int(depths.size),
+ "mask_idx": mask_idx,
+ }
+ )
+
+ return results
diff --git a/dimos/perception/segmentation/sam_2d_seg.py b/dimos/perception/segmentation/sam_2d_seg.py
index fcf27584e6..cb2acaf076 100644
--- a/dimos/perception/segmentation/sam_2d_seg.py
+++ b/dimos/perception/segmentation/sam_2d_seg.py
@@ -42,13 +42,12 @@ def __init__(
self,
model_path="models_fastsam",
model_name="FastSAM-s.onnx",
- device="cpu",
min_analysis_interval=5.0,
use_tracker=True,
use_analyzer=True,
use_rich_labeling=False,
+ use_filtering=True,
):
- self.device = device
if is_cuda_available():
logger.info("Using CUDA for SAM 2d segmenter")
if hasattr(onnxruntime, "preload_dlls"): # Handles CUDA 11 / onnxruntime-gpu<=1.18
@@ -62,6 +61,7 @@ def __init__(
self.use_tracker = use_tracker
self.use_analyzer = use_analyzer
self.use_rich_labeling = use_rich_labeling
+ self.use_filtering = use_filtering
module_dir = os.path.dirname(__file__)
self.tracker_config = os.path.join(module_dir, "config", "custom_tracker.yaml")
@@ -98,11 +98,10 @@ def process_image(self, image):
source=image,
device=self.device,
retina_masks=True,
- conf=0.6,
- iou=0.9,
+ conf=0.3,
+ iou=0.5,
persist=True,
verbose=False,
- tracker=self.tracker_config,
)
if len(results) > 0:
@@ -112,14 +111,25 @@ def process_image(self, image):
)
# Filter results
- (
- filtered_masks,
- filtered_bboxes,
- filtered_track_ids,
- filtered_probs,
- filtered_names,
- filtered_texture_values,
- ) = filter_segmentation_results(image, masks, bboxes, track_ids, probs, names, areas)
+ if self.use_filtering:
+ (
+ filtered_masks,
+ filtered_bboxes,
+ filtered_track_ids,
+ filtered_probs,
+ filtered_names,
+ filtered_texture_values,
+ ) = filter_segmentation_results(
+ image, masks, bboxes, track_ids, probs, names, areas
+ )
+ else:
+ # Use original results without filtering
+ filtered_masks = masks
+ filtered_bboxes = bboxes
+ filtered_track_ids = track_ids
+ filtered_probs = probs
+ filtered_names = names
+ filtered_texture_values = []
if self.use_tracker:
# Update tracker with filtered results
diff --git a/dimos/robot/agilex/README.md b/dimos/robot/agilex/README.md
new file mode 100644
index 0000000000..1e678cae65
--- /dev/null
+++ b/dimos/robot/agilex/README.md
@@ -0,0 +1,371 @@
+# DIMOS Manipulator Robot Development Guide
+
+This guide explains how to create robot classes, integrate agents, and use the DIMOS module system with LCM transport.
+
+## Table of Contents
+1. [Robot Class Architecture](#robot-class-architecture)
+2. [Module System & LCM Transport](#module-system--lcm-transport)
+3. [Agent Integration](#agent-integration)
+4. [Complete Example](#complete-example)
+
+## Robot Class Architecture
+
+### Basic Robot Class Structure
+
+A DIMOS robot class should follow this pattern:
+
+```python
+from typing import Optional, List
+from dimos import core
+from dimos.types.robot_capabilities import RobotCapability
+
+class YourRobot:
+ """Your robot implementation."""
+
+ def __init__(self, robot_capabilities: Optional[List[RobotCapability]] = None):
+ # Core components
+ self.dimos = None
+ self.modules = {}
+ self.skill_library = SkillLibrary()
+
+ # Define capabilities
+ self.capabilities = robot_capabilities or [
+ RobotCapability.VISION,
+ RobotCapability.MANIPULATION,
+ ]
+
+ async def start(self):
+ """Start the robot modules."""
+ # Initialize DIMOS with worker count
+ self.dimos = core.start(2) # Number of workers needed
+
+ # Deploy modules
+ # ... (see Module System section)
+
+ def stop(self):
+ """Stop all modules and clean up."""
+ # Stop modules
+ # Close DIMOS
+ if self.dimos:
+ self.dimos.close()
+```
+
+### Key Components Explained
+
+1. **Initialization**: Store references to modules, skills, and capabilities
+2. **Async Start**: Modules must be deployed asynchronously
+3. **Proper Cleanup**: Always stop modules before closing DIMOS
+
+## Module System & LCM Transport
+
+### Understanding DIMOS Modules
+
+Modules are the building blocks of DIMOS robots. They:
+- Process data streams (inputs)
+- Produce outputs
+- Can be connected together
+- Communicate via LCM (Lightweight Communications and Marshalling)
+
+### Deploying a Module
+
+```python
+# Deploy a camera module
+self.camera = self.dimos.deploy(
+ ZEDModule, # Module class
+ camera_id=0, # Module parameters
+ resolution="HD720",
+ depth_mode="NEURAL",
+ fps=30,
+ publish_rate=30.0,
+ frame_id="camera_frame"
+)
+```
+
+### Setting Up LCM Transport
+
+LCM transport enables inter-module communication:
+
+```python
+# Enable LCM auto-configuration
+from dimos.protocol import pubsub
+pubsub.lcm.autoconf()
+
+# Configure output transport
+self.camera.color_image.transport = core.LCMTransport(
+ "/camera/color_image", # Topic name
+ Image # Message type
+)
+self.camera.depth_image.transport = core.LCMTransport(
+ "/camera/depth_image",
+ Image
+)
+```
+
+### Connecting Modules
+
+Connect module outputs to inputs:
+
+```python
+# Connect manipulation module to camera outputs
+self.manipulation.rgb_image.connect(self.camera.color_image)
+self.manipulation.depth_image.connect(self.camera.depth_image)
+self.manipulation.camera_info.connect(self.camera.camera_info)
+```
+
+### Module Communication Pattern
+
+```
+┌──────────────┐ LCM ┌────────────────┐ LCM ┌──────────────┐
+│ Camera │────────▶│ Manipulation │────────▶│ Visualization│
+│ Module │ Messages│ Module │ Messages│ Output │
+└──────────────┘ └────────────────┘ └──────────────┘
+ ▲ ▲
+ │ │
+ └──────────────────────────┘
+ Direct Connection via RPC call
+```
+
+## Agent Integration
+
+### Setting Up Agent with Robot
+
+The run file pattern for agent integration:
+
+```python
+#!/usr/bin/env python3
+import asyncio
+import reactivex as rx, reactivex.operators as ops
+from dimos.agents.claude_agent import ClaudeAgent
+from dimos.web.robot_web_interface import RobotWebInterface
+
+def main():
+ # 1. Create and start robot
+ robot = YourRobot()
+ asyncio.run(robot.start())
+
+ # 2. Set up skills
+ skills = robot.get_skills()
+ skills.add(YourSkill)
+ skills.create_instance("YourSkill", robot=robot)
+
+ # 3. Set up reactive streams
+ agent_response_subject = rx.subject.Subject()
+ agent_response_stream = agent_response_subject.pipe(ops.share())
+
+ # 4. Create web interface
+ web_interface = RobotWebInterface(
+ port=5555,
+ text_streams={"agent_responses": agent_response_stream},
+ audio_subject=rx.subject.Subject()
+ )
+
+ # 5. Create agent
+ agent = ClaudeAgent(
+ dev_name="your_agent",
+ input_query_stream=web_interface.query_stream,
+ skills=skills,
+ system_query="Your system prompt here",
+ model_name="claude-3-5-haiku-latest"
+ )
+
+ # 6. Connect agent responses
+ agent.get_response_observable().subscribe(
+ lambda x: agent_response_subject.on_next(x)
+ )
+
+ # 7. Run interface
+ web_interface.run()
+```
+
+### Key Integration Points
+
+1. **Reactive Streams**: Use RxPy for event-driven communication
+2. **Web Interface**: Provides user input/output
+3. **Agent**: Processes natural language and executes skills
+4. **Skills**: Define robot capabilities as executable actions
+
+## Complete Example
+
+### Step 1: Create Robot Class (`my_robot.py`)
+
+```python
+import asyncio
+from typing import Optional, List
+from dimos import core
+from dimos.hardware.camera import CameraModule
+from dimos.manipulation.module import ManipulationModule
+from dimos.skills.skills import SkillLibrary
+from dimos.types.robot_capabilities import RobotCapability
+from dimos_lcm.sensor_msgs import Image, CameraInfo
+from dimos.protocol import pubsub
+
+class MyRobot:
+ def __init__(self, robot_capabilities: Optional[List[RobotCapability]] = None):
+ self.dimos = None
+ self.camera = None
+ self.manipulation = None
+ self.skill_library = SkillLibrary()
+
+ self.capabilities = robot_capabilities or [
+ RobotCapability.VISION,
+ RobotCapability.MANIPULATION,
+ ]
+
+ async def start(self):
+ # Start DIMOS
+ self.dimos = core.start(2)
+
+ # Enable LCM
+ pubsub.lcm.autoconf()
+
+ # Deploy camera
+ self.camera = self.dimos.deploy(
+ CameraModule,
+ camera_id=0,
+ fps=30
+ )
+
+ # Configure camera LCM
+ self.camera.color_image.transport = core.LCMTransport("/camera/rgb", Image)
+ self.camera.depth_image.transport = core.LCMTransport("/camera/depth", Image)
+ self.camera.camera_info.transport = core.LCMTransport("/camera/info", CameraInfo)
+
+ # Deploy manipulation
+ self.manipulation = self.dimos.deploy(ManipulationModule)
+
+ # Connect modules
+ self.manipulation.rgb_image.connect(self.camera.color_image)
+ self.manipulation.depth_image.connect(self.camera.depth_image)
+ self.manipulation.camera_info.connect(self.camera.camera_info)
+
+ # Configure manipulation output
+ self.manipulation.viz_image.transport = core.LCMTransport("/viz/output", Image)
+
+ # Start modules
+ self.camera.start()
+ self.manipulation.start()
+
+ await asyncio.sleep(2) # Allow initialization
+
+ def get_skills(self):
+ return self.skill_library
+
+ def stop(self):
+ if self.manipulation:
+ self.manipulation.stop()
+ if self.camera:
+ self.camera.stop()
+ if self.dimos:
+ self.dimos.close()
+```
+
+### Step 2: Create Run Script (`run.py`)
+
+```python
+#!/usr/bin/env python3
+import asyncio
+import os
+from my_robot import MyRobot
+from dimos.agents.claude_agent import ClaudeAgent
+from dimos.skills.basic import BasicSkill
+from dimos.web.robot_web_interface import RobotWebInterface
+import reactivex as rx
+import reactivex.operators as ops
+
+SYSTEM_PROMPT = """You are a helpful robot assistant."""
+
+def main():
+ # Check API key
+ if not os.getenv("ANTHROPIC_API_KEY"):
+ print("Please set ANTHROPIC_API_KEY")
+ return
+
+ # Create robot
+ robot = MyRobot()
+
+ try:
+ # Start robot
+ asyncio.run(robot.start())
+
+ # Set up skills
+ skills = robot.get_skills()
+ skills.add(BasicSkill)
+ skills.create_instance("BasicSkill", robot=robot)
+
+ # Set up streams
+ agent_response_subject = rx.subject.Subject()
+ agent_response_stream = agent_response_subject.pipe(ops.share())
+
+ # Create web interface
+ web_interface = RobotWebInterface(
+ port=5555,
+ text_streams={"agent_responses": agent_response_stream}
+ )
+
+ # Create agent
+ agent = ClaudeAgent(
+ dev_name="my_agent",
+ input_query_stream=web_interface.query_stream,
+ skills=skills,
+ system_query=SYSTEM_PROMPT
+ )
+
+ # Connect responses
+ agent.get_response_observable().subscribe(
+ lambda x: agent_response_subject.on_next(x)
+ )
+
+ print("Robot ready at http://localhost:5555")
+
+ # Run
+ web_interface.run()
+
+ finally:
+ robot.stop()
+
+if __name__ == "__main__":
+ main()
+```
+
+### Step 3: Define Skills (`skills.py`)
+
+```python
+from dimos.skills import Skill, skill
+
+@skill(
+ description="Perform a basic action",
+ parameters={
+ "action": "The action to perform"
+ }
+)
+class BasicSkill(Skill):
+ def __init__(self, robot):
+ self.robot = robot
+
+ def run(self, action: str):
+ # Implement skill logic
+ return f"Performed: {action}"
+```
+
+## Best Practices
+
+1. **Module Lifecycle**: Always start DIMOS before deploying modules
+2. **LCM Topics**: Use descriptive topic names with namespaces
+3. **Error Handling**: Wrap module operations in try-except blocks
+4. **Resource Cleanup**: Ensure proper cleanup in stop() methods
+5. **Async Operations**: Use asyncio for non-blocking operations
+6. **Stream Management**: Use RxPy for reactive programming patterns
+
+## Debugging Tips
+
+1. **Check Module Status**: Print module.io().result() to see connections
+2. **Monitor LCM**: Use Foxglove to visualize LCM messages
+3. **Log Everything**: Use dimos.utils.logging_config.setup_logger()
+4. **Test Modules Independently**: Deploy and test one module at a time
+
+## Common Issues
+
+1. **"Module not started"**: Ensure start() is called after deployment
+2. **"No data received"**: Check LCM transport configuration
+3. **"Connection failed"**: Verify input/output types match
+4. **"Cleanup errors"**: Stop modules before closing DIMOS
\ No newline at end of file
diff --git a/dimos/robot/agilex/README_CN.md b/dimos/robot/agilex/README_CN.md
new file mode 100644
index 0000000000..482a09dd6d
--- /dev/null
+++ b/dimos/robot/agilex/README_CN.md
@@ -0,0 +1,465 @@
+# DIMOS 机械臂机器人开发指南
+
+本指南介绍如何创建机器人类、集成智能体(Agent)以及使用 DIMOS 模块系统和 LCM 传输。
+
+## 目录
+1. [机器人类架构](#机器人类架构)
+2. [模块系统与 LCM 传输](#模块系统与-lcm-传输)
+3. [智能体集成](#智能体集成)
+4. [完整示例](#完整示例)
+
+## 机器人类架构
+
+### 基本机器人类结构
+
+DIMOS 机器人类应遵循以下模式:
+
+```python
+from typing import Optional, List
+from dimos import core
+from dimos.types.robot_capabilities import RobotCapability
+
+class YourRobot:
+ """您的机器人实现。"""
+
+ def __init__(self, robot_capabilities: Optional[List[RobotCapability]] = None):
+ # 核心组件
+ self.dimos = None
+ self.modules = {}
+ self.skill_library = SkillLibrary()
+
+ # 定义能力
+ self.capabilities = robot_capabilities or [
+ RobotCapability.VISION,
+ RobotCapability.MANIPULATION,
+ ]
+
+ async def start(self):
+ """启动机器人模块。"""
+ # 初始化 DIMOS,指定工作线程数
+ self.dimos = core.start(2) # 需要的工作线程数
+
+ # 部署模块
+ # ... (参见模块系统章节)
+
+ def stop(self):
+ """停止所有模块并清理资源。"""
+ # 停止模块
+ # 关闭 DIMOS
+ if self.dimos:
+ self.dimos.close()
+```
+
+### 关键组件说明
+
+1. **初始化**:存储模块、技能和能力的引用
+2. **异步启动**:模块必须异步部署
+3. **正确清理**:在关闭 DIMOS 之前始终停止模块
+
+## 模块系统与 LCM 传输
+
+### 理解 DIMOS 模块
+
+模块是 DIMOS 机器人的构建块。它们:
+- 处理数据流(输入)
+- 产生输出
+- 可以相互连接
+- 通过 LCM(轻量级通信和编组)进行通信
+
+### 部署模块
+
+```python
+# 部署相机模块
+self.camera = self.dimos.deploy(
+ ZEDModule, # 模块类
+ camera_id=0, # 模块参数
+ resolution="HD720",
+ depth_mode="NEURAL",
+ fps=30,
+ publish_rate=30.0,
+ frame_id="camera_frame"
+)
+```
+
+### 设置 LCM 传输
+
+LCM 传输实现模块间通信:
+
+```python
+# 启用 LCM 自动配置
+from dimos.protocol import pubsub
+pubsub.lcm.autoconf()
+
+# 配置输出传输
+self.camera.color_image.transport = core.LCMTransport(
+ "/camera/color_image", # 主题名称
+ Image # 消息类型
+)
+self.camera.depth_image.transport = core.LCMTransport(
+ "/camera/depth_image",
+ Image
+)
+```
+
+### 连接模块
+
+将模块输出连接到输入:
+
+```python
+# 将操作模块连接到相机输出
+self.manipulation.rgb_image.connect(self.camera.color_image) # ROS set_callback
+self.manipulation.depth_image.connect(self.camera.depth_image)
+self.manipulation.camera_info.connect(self.camera.camera_info)
+```
+
+### 模块通信模式
+
+```
+┌──────────────┐ LCM ┌────────────────┐ LCM ┌──────────────┐
+│ 相机模块 │────────▶│ 操作模块 │────────▶│ 可视化输出 │
+│ │ 消息 │ │ 消息 │ │
+└──────────────┘ └────────────────┘ └──────────────┘
+ ▲ ▲
+ │ │
+ └──────────────────────────┘
+ 直接连接(RPC指令)
+```
+
+## 智能体集成
+
+### 设置智能体与机器人
+
+运行文件的智能体集成模式:
+
+```python
+#!/usr/bin/env python3
+import asyncio
+import reactivex as rx, reactivex.operators as ops
+from dimos.agents.claude_agent import ClaudeAgent
+from dimos.web.robot_web_interface import RobotWebInterface
+
+def main():
+ # 1. 创建并启动机器人
+ robot = YourRobot()
+ asyncio.run(robot.start())
+
+ # 2. 设置技能
+ skills = robot.get_skills()
+ skills.add(YourSkill)
+ skills.create_instance("YourSkill", robot=robot)
+
+ # 3. 设置响应式流
+ agent_response_subject = rx.subject.Subject()
+ agent_response_stream = agent_response_subject.pipe(ops.share())
+
+ # 4. 创建 Web 界面
+ web_interface = RobotWebInterface(
+ port=5555,
+ text_streams={"agent_responses": agent_response_stream},
+ audio_subject=rx.subject.Subject()
+ )
+
+ # 5. 创建智能体
+ agent = ClaudeAgent(
+ dev_name="your_agent",
+ input_query_stream=web_interface.query_stream,
+ skills=skills,
+ system_query="您的系统提示词",
+ model_name="claude-3-5-haiku-latest"
+ )
+
+ # 6. 连接智能体响应
+ agent.get_response_observable().subscribe(
+ lambda x: agent_response_subject.on_next(x)
+ )
+
+ # 7. 运行界面
+ web_interface.run()
+```
+
+### 关键集成点
+
+1. **响应式流**:使用 RxPy 进行事件驱动通信
+2. **Web 界面**:提供用户输入/输出
+3. **智能体**:处理自然语言并执行技能
+4. **技能**:将机器人能力定义为可执行动作
+
+## 完整示例
+
+### 步骤 1:创建机器人类(`my_robot.py`)
+
+```python
+import asyncio
+from typing import Optional, List
+from dimos import core
+from dimos.hardware.camera import CameraModule
+from dimos.manipulation.module import ManipulationModule
+from dimos.skills.skills import SkillLibrary
+from dimos.types.robot_capabilities import RobotCapability
+from dimos_lcm.sensor_msgs import Image, CameraInfo
+from dimos.protocol import pubsub
+
+class MyRobot:
+ def __init__(self, robot_capabilities: Optional[List[RobotCapability]] = None):
+ self.dimos = None
+ self.camera = None
+ self.manipulation = None
+ self.skill_library = SkillLibrary()
+
+ self.capabilities = robot_capabilities or [
+ RobotCapability.VISION,
+ RobotCapability.MANIPULATION,
+ ]
+
+ async def start(self):
+ # 启动 DIMOS
+ self.dimos = core.start(2)
+
+ # 启用 LCM
+ pubsub.lcm.autoconf()
+
+ # 部署相机
+ self.camera = self.dimos.deploy(
+ CameraModule,
+ camera_id=0,
+ fps=30
+ )
+
+ # 配置相机 LCM
+ self.camera.color_image.transport = core.LCMTransport("/camera/rgb", Image)
+ self.camera.depth_image.transport = core.LCMTransport("/camera/depth", Image)
+ self.camera.camera_info.transport = core.LCMTransport("/camera/info", CameraInfo)
+
+ # 部署操作模块
+ self.manipulation = self.dimos.deploy(ManipulationModule)
+
+ # 连接模块
+ self.manipulation.rgb_image.connect(self.camera.color_image)
+ self.manipulation.depth_image.connect(self.camera.depth_image)
+ self.manipulation.camera_info.connect(self.camera.camera_info)
+
+ # 配置操作输出
+ self.manipulation.viz_image.transport = core.LCMTransport("/viz/output", Image)
+
+ # 启动模块
+ self.camera.start()
+ self.manipulation.start()
+
+ await asyncio.sleep(2) # 允许初始化
+
+ def get_skills(self):
+ return self.skill_library
+
+ def stop(self):
+ if self.manipulation:
+ self.manipulation.stop()
+ if self.camera:
+ self.camera.stop()
+ if self.dimos:
+ self.dimos.close()
+```
+
+### 步骤 2:创建运行脚本(`run.py`)
+
+```python
+#!/usr/bin/env python3
+import asyncio
+import os
+from my_robot import MyRobot
+from dimos.agents.claude_agent import ClaudeAgent
+from dimos.skills.basic import BasicSkill
+from dimos.web.robot_web_interface import RobotWebInterface
+import reactivex as rx
+import reactivex.operators as ops
+
+SYSTEM_PROMPT = """您是一个有用的机器人助手。"""
+
+def main():
+ # 检查 API 密钥
+ if not os.getenv("ANTHROPIC_API_KEY"):
+ print("请设置 ANTHROPIC_API_KEY")
+ return
+
+ # 创建机器人
+ robot = MyRobot()
+
+ try:
+ # 启动机器人
+ asyncio.run(robot.start())
+
+ # 设置技能
+ skills = robot.get_skills()
+ skills.add(BasicSkill)
+ skills.create_instance("BasicSkill", robot=robot)
+
+ # 设置流
+ agent_response_subject = rx.subject.Subject()
+ agent_response_stream = agent_response_subject.pipe(ops.share())
+
+ # 创建 Web 界面
+ web_interface = RobotWebInterface(
+ port=5555,
+ text_streams={"agent_responses": agent_response_stream}
+ )
+
+ # 创建智能体
+ agent = ClaudeAgent(
+ dev_name="my_agent",
+ input_query_stream=web_interface.query_stream,
+ skills=skills,
+ system_query=SYSTEM_PROMPT
+ )
+
+ # 连接响应
+ agent.get_response_observable().subscribe(
+ lambda x: agent_response_subject.on_next(x)
+ )
+
+ print("机器人就绪,访问 http://localhost:5555")
+
+ # 运行
+ web_interface.run()
+
+ finally:
+ robot.stop()
+
+if __name__ == "__main__":
+ main()
+```
+
+### 步骤 3:定义技能(`skills.py`)
+
+```python
+from dimos.skills import Skill, skill
+
+@skill(
+ description="执行一个基本动作",
+ parameters={
+ "action": "要执行的动作"
+ }
+)
+class BasicSkill(Skill):
+ def __init__(self, robot):
+ self.robot = robot
+
+ def run(self, action: str):
+ # 实现技能逻辑
+ return f"已执行:{action}"
+```
+
+## 最佳实践
+
+1. **模块生命周期**:在部署模块之前始终先启动 DIMOS
+2. **LCM 主题**:使用带命名空间的描述性主题名称
+3. **错误处理**:用 try-except 块包装模块操作
+4. **资源清理**:确保在 stop() 方法中正确清理
+5. **异步操作**:使用 asyncio 进行非阻塞操作
+6. **流管理**:使用 RxPy 进行响应式编程模式
+
+## 调试技巧
+
+1. **检查模块状态**:打印 module.io().result() 查看连接
+2. **监控 LCM**:使用 Foxglove 可视化 LCM 消息
+3. **记录一切**:使用 dimos.utils.logging_config.setup_logger()
+4. **独立测试模块**:一次部署和测试一个模块
+
+## 常见问题
+
+1. **"模块未启动"**:确保在部署后调用 start()
+2. **"未收到数据"**:检查 LCM 传输配置
+3. **"连接失败"**:验证输入/输出类型是否匹配
+4. **"清理错误"**:在关闭 DIMOS 之前停止模块
+
+## 高级主题
+
+### 自定义模块开发
+
+创建自定义模块的基本结构:
+
+```python
+from dimos.core import Module, In, Out, rpc
+
+class CustomModule(Module):
+ # 定义输入
+ input_data: In[DataType] = None
+
+ # 定义输出
+ output_data: Out[DataType] = None
+
+ def __init__(self, param1, param2, **kwargs):
+ super().__init__(**kwargs)
+ self.param1 = param1
+ self.param2 = param2
+
+ @rpc
+ def start(self):
+ """启动模块处理。"""
+ self.input_data.subscribe(self._process_data)
+
+ def _process_data(self, data):
+ """处理输入数据。"""
+ # 处理逻辑
+ result = self.process(data)
+ # 发布输出
+ self.output_data.publish(result)
+
+ @rpc
+ def stop(self):
+ """停止模块。"""
+ # 清理资源
+ pass
+```
+
+### 技能开发指南
+
+技能是机器人可执行的高级动作:
+
+```python
+from dimos.skills import Skill, skill
+from typing import Optional
+
+@skill(
+ description="复杂操作技能",
+ parameters={
+ "target": "目标对象",
+ "location": "目标位置"
+ }
+)
+class ComplexSkill(Skill):
+ def __init__(self, robot, **kwargs):
+ super().__init__(**kwargs)
+ self.robot = robot
+
+ def run(self, target: str, location: Optional[str] = None):
+ """执行技能逻辑。"""
+ try:
+ # 1. 感知阶段
+ object_info = self.robot.detect_object(target)
+
+ # 2. 规划阶段
+            plan = None
+            if location:
+                plan = self.robot.plan_movement(object_info, location)
+            # 3. 执行阶段
+ result = self.robot.execute_plan(plan)
+
+ return {
+ "success": True,
+ "message": f"成功移动 {target} 到 {location}"
+ }
+
+ except Exception as e:
+ return {
+ "success": False,
+ "error": str(e)
+ }
+```
+
+### 性能优化
+
+1. **并行处理**:使用多个工作线程处理不同模块
+2. **数据缓冲**:为高频数据流实现缓冲机制
+3. **延迟加载**:仅在需要时初始化重型模块
+4. **资源池化**:重用昂贵的资源(如神经网络模型)
+
+希望本指南能帮助您快速上手 DIMOS 机器人开发!
\ No newline at end of file
diff --git a/dimos/robot/agilex/piper_arm.py b/dimos/robot/agilex/piper_arm.py
new file mode 100644
index 0000000000..63dc419a78
--- /dev/null
+++ b/dimos/robot/agilex/piper_arm.py
@@ -0,0 +1,201 @@
+# Copyright 2025 Dimensional Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import asyncio
+import logging
+from typing import Optional, List
+
+from dimos import core
+from dimos.hardware.zed_camera import ZEDModule
+from dimos.manipulation.visual_servoing.manipulation_module import ManipulationModule
+from dimos_lcm.sensor_msgs import Image
+from dimos.protocol import pubsub
+from dimos.skills.skills import SkillLibrary
+from dimos.types.robot_capabilities import RobotCapability
+from dimos.robot.foxglove_bridge import FoxgloveBridge
+from dimos.utils.logging_config import setup_logger
+
+# Import LCM message types
+from dimos_lcm.sensor_msgs import CameraInfo
+
+logger = setup_logger("dimos.robot.agilex.piper_arm")
+
+
+class PiperArmRobot:
+ """Piper Arm robot with ZED camera and manipulation capabilities."""
+
+ def __init__(self, robot_capabilities: Optional[List[RobotCapability]] = None):
+ self.dimos = None
+ self.stereo_camera = None
+ self.manipulation_interface = None
+ self.skill_library = SkillLibrary()
+
+ # Initialize capabilities
+ self.capabilities = robot_capabilities or [
+ RobotCapability.VISION,
+ RobotCapability.MANIPULATION,
+ ]
+
+ async def start(self):
+ """Start the robot modules."""
+ # Start Dimos
+ self.dimos = core.start(2) # Need 2 workers for ZED and manipulation modules
+ self.foxglove_bridge = FoxgloveBridge()
+
+ # Enable LCM auto-configuration
+ pubsub.lcm.autoconf()
+
+ # Deploy ZED module
+ logger.info("Deploying ZED module...")
+ self.stereo_camera = self.dimos.deploy(
+ ZEDModule,
+ camera_id=0,
+ resolution="HD720",
+ depth_mode="NEURAL",
+ fps=30,
+ enable_tracking=False, # We don't need tracking for manipulation
+ publish_rate=30.0,
+ frame_id="zed_camera",
+ )
+
+ # Configure ZED LCM transports
+ self.stereo_camera.color_image.transport = core.LCMTransport("/zed/color_image", Image)
+ self.stereo_camera.depth_image.transport = core.LCMTransport("/zed/depth_image", Image)
+ self.stereo_camera.camera_info.transport = core.LCMTransport("/zed/camera_info", CameraInfo)
+
+ # Deploy manipulation module
+ logger.info("Deploying manipulation module...")
+ self.manipulation_interface = self.dimos.deploy(ManipulationModule)
+
+ # Connect manipulation inputs to ZED outputs
+ self.manipulation_interface.rgb_image.connect(self.stereo_camera.color_image)
+ self.manipulation_interface.depth_image.connect(self.stereo_camera.depth_image)
+ self.manipulation_interface.camera_info.connect(self.stereo_camera.camera_info)
+
+ # Configure manipulation output
+ self.manipulation_interface.viz_image.transport = core.LCMTransport(
+ "/manipulation/viz", Image
+ )
+
+ # Print module info
+ logger.info("Modules configured:")
+ print("\nZED Module:")
+ print(self.stereo_camera.io().result())
+ print("\nManipulation Module:")
+ print(self.manipulation_interface.io().result())
+
+ # Start modules
+ logger.info("Starting modules...")
+ self.foxglove_bridge.start()
+ self.stereo_camera.start()
+ self.manipulation_interface.start()
+
+ # Give modules time to initialize
+ await asyncio.sleep(2)
+
+ logger.info("PiperArmRobot initialized and started")
+
+ def get_skills(self):
+ """Get the robot's skill library.
+
+ Returns:
+ The robot's skill library for adding/managing skills
+ """
+ return self.skill_library
+
+ def pick_and_place(
+ self, pick_x: int, pick_y: int, place_x: Optional[int] = None, place_y: Optional[int] = None
+ ):
+ """Execute pick and place task.
+
+ Args:
+ pick_x: X coordinate for pick location
+ pick_y: Y coordinate for pick location
+ place_x: X coordinate for place location (optional)
+ place_y: Y coordinate for place location (optional)
+
+ Returns:
+ Result of the pick and place operation
+ """
+ if self.manipulation_interface:
+ return self.manipulation_interface.pick_and_place(pick_x, pick_y, place_x, place_y)
+ else:
+ logger.error("Manipulation module not initialized")
+ return False
+
+ def handle_keyboard_command(self, key: str):
+ """Pass keyboard commands to manipulation module.
+
+ Args:
+ key: Keyboard key pressed
+
+ Returns:
+ Action taken or None
+ """
+ if self.manipulation_interface:
+ return self.manipulation_interface.handle_keyboard_command(key)
+ else:
+ logger.error("Manipulation module not initialized")
+ return None
+
+ def has_capability(self, capability: RobotCapability) -> bool:
+ """Check if the robot has a specific capability.
+
+ Args:
+ capability: The capability to check for
+
+ Returns:
+ bool: True if the robot has the capability
+ """
+ return capability in self.capabilities
+
+ def stop(self):
+ """Stop all modules and clean up."""
+ logger.info("Stopping PiperArmRobot...")
+
+ try:
+ if self.manipulation_interface:
+ self.manipulation_interface.stop()
+ self.manipulation_interface.cleanup()
+
+ if self.stereo_camera:
+ self.stereo_camera.stop()
+ except Exception as e:
+ logger.warning(f"Error stopping modules: {e}")
+
+ # Close dimos last to ensure workers are available for cleanup
+ if self.dimos:
+ self.dimos.close()
+
+ logger.info("PiperArmRobot stopped")
+
+
+async def run_piper_arm():
+ """Run the Piper Arm robot."""
+ robot = PiperArmRobot()
+
+ await robot.start()
+
+ # Keep the robot running
+ try:
+ while True:
+ await asyncio.sleep(1)
+ except KeyboardInterrupt:
+ logger.info("Keyboard interrupt received")
+ finally:
+        robot.stop()
+
+
+if __name__ == "__main__":
+ asyncio.run(run_piper_arm())
diff --git a/dimos/robot/agilex/run.py b/dimos/robot/agilex/run.py
new file mode 100644
index 0000000000..c9e5a036d8
--- /dev/null
+++ b/dimos/robot/agilex/run.py
@@ -0,0 +1,196 @@
+#!/usr/bin/env python3
+# Copyright 2025 Dimensional Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+Run script for Piper Arm robot with Claude agent integration.
+Provides manipulation capabilities with natural language interface.
+"""
+
+import asyncio
+import os
+import sys
+import time
+from dotenv import load_dotenv
+
+import reactivex as rx
+import reactivex.operators as ops
+
+from dimos.robot.agilex.piper_arm import PiperArmRobot
+from dimos.agents.claude_agent import ClaudeAgent
+from dimos.skills.manipulation.pick_and_place import PickAndPlace
+from dimos.skills.kill_skill import KillSkill
+from dimos.skills.observe import Observe
+from dimos.web.robot_web_interface import RobotWebInterface
+from dimos.stream.audio.pipelines import stt, tts
+from dimos.utils.logging_config import setup_logger
+
+logger = setup_logger("dimos.robot.agilex.run")
+
+# Load environment variables
+load_dotenv()
+
+# System prompt for the Piper Arm manipulation agent
+SYSTEM_PROMPT = """You are an intelligent robotic assistant controlling a Piper Arm robot with advanced manipulation capabilities. Your primary role is to help users with pick and place tasks using natural language understanding.
+
+## Your Capabilities:
+1. **Visual Perception**: You have access to a ZED stereo camera that provides RGB and depth information
+2. **Object Manipulation**: You can pick up and place objects using a 6-DOF robotic arm with a gripper
+3. **Language Understanding**: You use the Qwen vision-language model to identify objects and locations from natural language descriptions
+
+## Available Skills:
+- **PickAndPlace**: Execute pick and place operations based on object and location descriptions
+ - Pick only: "Pick up the red mug"
+ - Pick and place: "Move the book to the shelf"
+- **Observe**: Capture and analyze the current camera view
+- **KillSkill**: Stop any currently running skill
+
+## Guidelines:
+1. **Safety First**: Always ensure safe operation. If unsure about an object's graspability or a placement location's stability, ask for clarification
+2. **Clear Communication**: Explain what you're doing and ask for confirmation when needed
+3. **Error Handling**: If a task fails, explain why and suggest alternatives
+4. **Precision**: When users give specific object descriptions, use them exactly as provided to the vision model
+
+## Interaction Examples:
+- User: "Pick up the coffee mug"
+ You: "I'll pick up the coffee mug for you." [Execute PickAndPlace with object_query="coffee mug"]
+
+- User: "Put the toy on the table"
+ You: "I'll place the toy on the table." [Execute PickAndPlace with object_query="toy", target_query="on the table"]
+
+- User: "What do you see?"
+ You: "Let me take a look at the current view." [Execute Observe]
+
+Remember: You're here to assist with manipulation tasks. Be helpful, precise, and always prioritize safe operation of the robot."""
+
+
+def main():
+ """Main entry point."""
+ print("\n" + "=" * 60)
+ print("Piper Arm Robot with Claude Agent")
+ print("=" * 60)
+ print("\nThis system integrates:")
+ print(" - Piper Arm 6-DOF robot")
+ print(" - ZED stereo camera")
+ print(" - Claude AI for natural language understanding")
+ print(" - Qwen VLM for visual object detection")
+ print(" - Web interface with text and voice input")
+ print(" - Foxglove visualization via LCM")
+ print("\nStarting system...\n")
+
+ # Check for API key
+ if not os.getenv("ANTHROPIC_API_KEY"):
+ print("WARNING: ANTHROPIC_API_KEY not found in environment")
+ print("Please set your API key in .env file or environment")
+ sys.exit(1)
+
+ logger.info("Starting Piper Arm Robot with Agent")
+
+ # Create robot instance
+ robot = PiperArmRobot()
+
+ try:
+ # Start the robot (this is async, so we need asyncio.run)
+ logger.info("Initializing robot...")
+ asyncio.run(robot.start())
+ logger.info("Robot initialized successfully")
+
+ # Set up skill library
+ skills = robot.get_skills()
+ skills.add(PickAndPlace)
+ skills.add(Observe)
+ skills.add(KillSkill)
+
+ # Create skill instances
+ skills.create_instance("PickAndPlace", robot=robot)
+ skills.create_instance("Observe", robot=robot)
+ skills.create_instance("KillSkill", robot=robot, skill_library=skills)
+
+ logger.info(f"Skills registered: {[skill.__name__ for skill in skills.get_class_skills()]}")
+
+ # Set up streams for agent and web interface
+ agent_response_subject = rx.subject.Subject()
+ agent_response_stream = agent_response_subject.pipe(ops.share())
+ audio_subject = rx.subject.Subject()
+
+ # Set up streams for web interface
+ streams = {}
+
+ text_streams = {
+ "agent_responses": agent_response_stream,
+ }
+
+ # Create web interface first (needed for agent)
+ try:
+ web_interface = RobotWebInterface(
+ port=5555, text_streams=text_streams, audio_subject=audio_subject, **streams
+ )
+ logger.info("Web interface created successfully")
+ except Exception as e:
+ logger.error(f"Failed to create web interface: {e}")
+ raise
+
+ # Set up speech-to-text
+ stt_node = stt()
+ stt_node.consume_audio(audio_subject.pipe(ops.share()))
+
+ # Create Claude agent
+ agent = ClaudeAgent(
+ dev_name="piper_arm_agent",
+ input_query_stream=web_interface.query_stream, # Use text input from web interface
+ # input_query_stream=stt_node.emit_text(), # Uncomment to use voice input
+ skills=skills,
+ system_query=SYSTEM_PROMPT,
+ model_name="claude-3-5-haiku-latest",
+ thinking_budget_tokens=0,
+ max_output_tokens_per_request=4096,
+ )
+
+ # Subscribe to agent responses
+ agent.get_response_observable().subscribe(lambda x: agent_response_subject.on_next(x))
+
+ # Set up text-to-speech for agent responses
+ tts_node = tts()
+ tts_node.consume_text(agent.get_response_observable())
+
+ logger.info("=" * 60)
+ logger.info("Piper Arm Agent Ready!")
+ logger.info(f"Web interface available at: http://localhost:5555")
+ logger.info("Foxglove visualization available at: ws://localhost:8765")
+ logger.info("You can:")
+ logger.info(" - Type commands in the web interface")
+ logger.info(" - Use voice commands")
+ logger.info(" - Ask the robot to pick up objects")
+ logger.info(" - Ask the robot to move objects to locations")
+ logger.info("=" * 60)
+
+ # Run web interface (this blocks)
+ web_interface.run()
+
+ except KeyboardInterrupt:
+ logger.info("Keyboard interrupt received")
+ except Exception as e:
+ logger.error(f"Error running robot: {e}")
+ import traceback
+
+ traceback.print_exc()
+ finally:
+ logger.info("Shutting down...")
+        # Stop the robot (stop() is synchronous; no await/asyncio needed here)
+ robot.stop()
+ logger.info("Robot stopped")
+
+
+if __name__ == "__main__":
+ main()
diff --git a/dimos/robot/frontier_exploration/__init__.py b/dimos/robot/frontier_exploration/__init__.py
index 2b69011a9f..8b13789179 100644
--- a/dimos/robot/frontier_exploration/__init__.py
+++ b/dimos/robot/frontier_exploration/__init__.py
@@ -1 +1 @@
-from utils import *
+
diff --git a/dimos/robot/frontier_exploration/test_wavefront_frontier_goal_selector.py b/dimos/robot/frontier_exploration/test_wavefront_frontier_goal_selector.py
index c9b75b28d8..cd344dd0b4 100644
--- a/dimos/robot/frontier_exploration/test_wavefront_frontier_goal_selector.py
+++ b/dimos/robot/frontier_exploration/test_wavefront_frontier_goal_selector.py
@@ -25,7 +25,7 @@
)
from dimos.robot.unitree_webrtc.type.lidar import LidarMessage
from dimos.robot.unitree_webrtc.type.map import Map
-from dimos.types.vector import Vector
+from dimos.msgs.geometry_msgs import Vector3 as Vector
from dimos.utils.testing import SensorReplay
@@ -130,13 +130,6 @@ def test_exploration_goal_selection():
assert isinstance(goal, Vector), "Goal should be a Vector"
print(f"Selected exploration goal: ({goal.x:.2f}, {goal.y:.2f})")
- # Verify goal is at reasonable distance from robot
- distance = np.sqrt((goal.x - robot_pose.x) ** 2 + (goal.y - robot_pose.y) ** 2)
- print(f"Goal distance from robot: {distance:.2f}m")
- assert distance >= explorer.min_distance_from_robot, (
- "Goal should respect minimum distance from robot"
- )
-
# Test that goal gets marked as explored
assert len(explorer.explored_goals) == 1, "Goal should be marked as explored"
assert explorer.explored_goals[0] == goal, "Explored goal should match selected goal"
diff --git a/dimos/robot/frontier_exploration/wavefront_frontier_goal_selector.py b/dimos/robot/frontier_exploration/wavefront_frontier_goal_selector.py
index 1aca32fc93..454a70e803 100644
--- a/dimos/robot/frontier_exploration/wavefront_frontier_goal_selector.py
+++ b/dimos/robot/frontier_exploration/wavefront_frontier_goal_selector.py
@@ -82,11 +82,9 @@ class WavefrontFrontierExplorer:
def __init__(
self,
- min_frontier_size: int = 10,
+ min_frontier_size: int = 8,
occupancy_threshold: int = 65,
- subsample_resolution: int = 2,
- min_distance_from_robot: float = 0.5,
- explored_area_buffer: float = 0.5,
+ subsample_resolution: int = 3,
min_distance_from_obstacles: float = 0.6,
info_gain_threshold: float = 0.03,
num_no_gain_attempts: int = 4,
@@ -101,8 +99,6 @@ def __init__(
min_frontier_size: Minimum number of points to consider a valid frontier
occupancy_threshold: Cost threshold above which a cell is considered occupied (0-255)
subsample_resolution: Factor by which to subsample the costmap for faster processing (1=no subsampling, 2=half resolution, 4=quarter resolution)
- min_distance_from_robot: Minimum distance frontier must be from robot (meters)
- explored_area_buffer: Buffer distance around free areas to consider as explored (meters)
min_distance_from_obstacles: Minimum distance frontier must be from obstacles (meters)
info_gain_threshold: Minimum percentage increase in costmap information required to continue exploration (0.05 = 5%)
num_no_gain_attempts: Maximum number of consecutive attempts with no information gain
@@ -113,8 +109,6 @@ def __init__(
self.min_frontier_size = min_frontier_size
self.occupancy_threshold = occupancy_threshold
self.subsample_resolution = subsample_resolution
- self.min_distance_from_robot = min_distance_from_robot
- self.explored_area_buffer = explored_area_buffer
self.min_distance_from_obstacles = min_distance_from_obstacles
self.info_gain_threshold = info_gain_threshold
self.num_no_gain_attempts = num_no_gain_attempts
@@ -513,15 +507,6 @@ def _rank_frontiers(
valid_frontiers = []
for i, frontier in enumerate(frontier_centroids):
- robot_distance = np.sqrt(
- (frontier.x - robot_pose.x) ** 2 + (frontier.y - robot_pose.y) ** 2
- )
-
- # Filter 1: Skip frontiers too close to robot
- if robot_distance < self.min_distance_from_robot:
- continue
-
- # Filter 2: Skip frontiers too close to obstacles
obstacle_distance = self._compute_distance_to_obstacles(frontier, costmap)
if obstacle_distance < self.min_distance_from_obstacles:
continue
diff --git a/dimos/robot/unitree_webrtc/connection.py b/dimos/robot/unitree_webrtc/connection.py
index 9bc1874cbe..6119cba860 100644
--- a/dimos/robot/unitree_webrtc/connection.py
+++ b/dimos/robot/unitree_webrtc/connection.py
@@ -239,7 +239,7 @@ async def accept_track(track: MediaStreamTrack) -> VideoMessage:
if stop_event.is_set():
return
frame = await track.recv()
- subject.on_next(Image.from_numpy(frame.to_ndarray(format="bgr24")))
+ subject.on_next(Image.from_numpy(frame.to_ndarray(format="rgb24")))
self.conn.video.add_track_callback(accept_track)
diff --git a/dimos/robot/unitree_webrtc/multiprocess/unitree_go2.py b/dimos/robot/unitree_webrtc/multiprocess/unitree_go2.py
index f2b701fc63..0532be8320 100644
--- a/dimos/robot/unitree_webrtc/multiprocess/unitree_go2.py
+++ b/dimos/robot/unitree_webrtc/multiprocess/unitree_go2.py
@@ -94,7 +94,9 @@ def odom_stream(self):
@functools.cache
def video_stream(self):
print("video stream start")
- video_store = TimedSensorReplay("unitree_office_walk/video", autocast=Image.from_numpy)
+ video_store = TimedSensorReplay(
+ "unitree_office_walk/video", autocast=lambda x: Image.from_numpy(x).to_rgb()
+ )
return video_store.stream()
def move(self, vector: Vector):
diff --git a/dimos/skills/manipulation/pick_and_place.py b/dimos/skills/manipulation/pick_and_place.py
new file mode 100644
index 0000000000..4306975d8d
--- /dev/null
+++ b/dimos/skills/manipulation/pick_and_place.py
@@ -0,0 +1,439 @@
+# Copyright 2025 Dimensional Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+Pick and place skill for Piper Arm robot.
+
+This module provides a skill that uses Qwen VLM to identify pick and place
+locations based on natural language queries, then executes the manipulation.
+"""
+
+import json
+import cv2
+import os
+from typing import Optional, Tuple, Dict, Any
+import numpy as np
+from pydantic import Field
+
+from dimos.skills.manipulation.abstract_manipulation_skill import AbstractManipulationSkill
+from dimos.models.qwen.video_query import query_single_frame
+from dimos.utils.logging_config import setup_logger
+
+logger = setup_logger("dimos.skills.manipulation.pick_and_place")
+
+
+def parse_qwen_points_response(response: str) -> Optional[Tuple[Tuple[int, int], Tuple[int, int]]]:
+ """
+ Parse Qwen's response containing two points.
+
+ Args:
+ response: Qwen's response containing JSON with two points
+
+ Returns:
+ Tuple of (pick_point, place_point) where each point is (x, y), or None if parsing fails
+ """
+ try:
+ # Try to extract JSON from the response
+ start_idx = response.find("{")
+ end_idx = response.rfind("}") + 1
+
+ if start_idx >= 0 and end_idx > start_idx:
+ json_str = response[start_idx:end_idx]
+ result = json.loads(json_str)
+
+ # Extract pick and place points
+ if "pick_point" in result and "place_point" in result:
+ pick = result["pick_point"]
+ place = result["place_point"]
+
+ # Validate points have x,y coordinates
+ if (
+ isinstance(pick, (list, tuple))
+ and len(pick) >= 2
+ and isinstance(place, (list, tuple))
+ and len(place) >= 2
+ ):
+ return (int(pick[0]), int(pick[1])), (int(place[0]), int(place[1]))
+
+ except Exception as e:
+ logger.error(f"Error parsing Qwen points response: {e}")
+ logger.debug(f"Raw response: {response}")
+
+ return None
+
+
+def save_debug_image_with_points(
+ image: np.ndarray,
+ pick_point: Optional[Tuple[int, int]] = None,
+ place_point: Optional[Tuple[int, int]] = None,
+ filename_prefix: str = "qwen_debug",
+) -> str:
+ """
+ Save debug image with crosshairs marking pick and/or place points.
+
+ Args:
+ image: Image array in OpenCV BGR channel order (the caller converts
+ RGB to BGR before passing, and cv2.imwrite expects BGR) — confirm
+ for any new callers
+ pick_point: (x, y) pixel coordinates for pick location
+ place_point: (x, y) pixel coordinates for place location
+ filename_prefix: Prefix for the saved filename
+
+ Returns:
+ Path to the saved image (written into the current working directory)
+ """
+ # Create a copy to avoid modifying original
+ debug_image = image.copy()
+
+ # Draw pick point crosshair (green in BGR)
+ if pick_point:
+ x, y = pick_point
+ # Draw crosshair
+ cv2.drawMarker(debug_image, (x, y), (0, 255, 0), cv2.MARKER_CROSS, 30, 2)
+ # Draw circle
+ cv2.circle(debug_image, (x, y), 5, (0, 255, 0), -1)
+ # Add label
+ cv2.putText(
+ debug_image, "PICK", (x + 10, y - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0, 255, 0), 2
+ )
+
+ # Draw place point crosshair (cyan in BGR)
+ if place_point:
+ x, y = place_point
+ # Draw crosshair
+ cv2.drawMarker(debug_image, (x, y), (255, 255, 0), cv2.MARKER_CROSS, 30, 2)
+ # Draw circle
+ cv2.circle(debug_image, (x, y), 5, (255, 255, 0), -1)
+ # Add label
+ cv2.putText(
+ debug_image, "PLACE", (x + 10, y - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (255, 255, 0), 2
+ )
+
+ # Draw arrow from pick to place if both exist
+ if pick_point and place_point:
+ cv2.arrowedLine(debug_image, pick_point, place_point, (255, 0, 255), 2, tipLength=0.03)
+
+ # Build output filename from the prefix. NOTE: despite earlier intent, no
+ # timestamp is included, so repeated calls overwrite the same file.
+ filename = f"{filename_prefix}.png"
+ filepath = os.path.join(os.getcwd(), filename)
+
+ # Save image
+ cv2.imwrite(filepath, debug_image)
+ logger.info(f"Debug image saved to: {filepath}")
+
+ return filepath
+
+
+def parse_qwen_single_point_response(response: str) -> Optional[Tuple[int, int]]:
+ """
+ Parse Qwen's response containing a single point.
+
+ Args:
+ response: Qwen's response containing JSON with a point
+
+ Returns:
+ Tuple of (x, y) or None if parsing fails
+ """
+ try:
+ # Try to extract JSON from the response
+ start_idx = response.find("{")
+ end_idx = response.rfind("}") + 1
+
+ if start_idx >= 0 and end_idx > start_idx:
+ json_str = response[start_idx:end_idx]
+ result = json.loads(json_str)
+
+ # Try different possible keys
+ point = None
+ for key in ["point", "location", "position", "coordinates"]:
+ if key in result:
+ point = result[key]
+ break
+
+ # Validate point has x,y coordinates
+ if point and isinstance(point, (list, tuple)) and len(point) >= 2:
+ return int(point[0]), int(point[1])
+
+ except Exception as e:
+ logger.error(f"Error parsing Qwen single point response: {e}")
+ logger.debug(f"Raw response: {response}")
+
+ return None
+
+
+class PickAndPlace(AbstractManipulationSkill):
+ """
+ A skill that performs pick and place operations using vision-language guidance.
+
+ This skill uses Qwen VLM to identify objects and locations based on natural
+ language queries, then executes pick and place operations using the robot's
+ manipulation interface.
+
+ Example usage:
+ # Just pick the object
+ skill = PickAndPlace(robot=robot, object_query="red mug")
+
+ # Pick and place the object
+ skill = PickAndPlace(robot=robot, object_query="red mug", target_query="on the coaster")
+
+ The skill uses the robot's stereo camera to capture RGB images and its manipulation
+ interface to execute the pick and place operation. It automatically handles coordinate
+ transformation from 2D pixel coordinates to 3D world coordinates.
+ """
+
+ object_query: str = Field(
+ "mug",
+ description="Natural language description of the object to pick (e.g., 'red mug', 'small box')",
+ )
+
+ target_query: Optional[str] = Field(
+ None,
+ description="Natural language description of where to place the object (e.g., 'on the table', 'in the basket'). If not provided, only pick operation will be performed.",
+ )
+
+ model_name: str = Field(
+ "qwen2.5-vl-72b-instruct", description="Qwen model to use for visual queries"
+ )
+
+ def __init__(self, robot=None, **data):
+ """
+ Initialize the PickAndPlace skill.
+
+ Args:
+ robot: The PiperArmRobot instance
+ **data: Additional configuration data
+ """
+ super().__init__(robot=robot, **data)
+
+ def _get_camera_frame(self) -> Optional[np.ndarray]:
+ """
+ Get a single RGB frame from the robot's camera.
+
+ Returns:
+ RGB image as numpy array or None if capture fails
+ """
+ if not self._robot or not self._robot.manipulation_interface:
+ logger.error("Robot or stereo camera not available")
+ return None
+
+ try:
+ # Use the RPC call to get a single RGB frame
+ rgb_frame = self._robot.manipulation_interface.get_single_rgb_frame()
+ if rgb_frame is None:
+ logger.error("Failed to capture RGB frame from camera")
+ return rgb_frame
+ except Exception as e:
+ logger.error(f"Error getting camera frame: {e}")
+ return None
+
+ def _query_pick_and_place_points(
+ self, frame: np.ndarray
+ ) -> Optional[Tuple[Tuple[int, int], Tuple[int, int]]]:
+ """
+ Query Qwen to get both pick and place points in a single query.
+
+ Args:
+ frame: RGB image array
+
+ Returns:
+ Tuple of (pick_point, place_point) or None if query fails
+ """
+ # This method is only called when both object and target are specified
+ prompt = (
+ f"Look at this image carefully. I need you to identify two specific locations:\n"
+ f"1. Find the {self.object_query} - this is the object I want to pick up\n"
+ f"2. Identify where to place it {self.target_query}\n\n"
+ "Instructions:\n"
+ "- The pick_point should be at the center or graspable part of the object\n"
+ "- The place_point should be a stable, flat surface at the target location\n"
+ "- Consider the object's size when choosing the placement point\n\n"
+ "Return ONLY a JSON object with this exact format:\n"
+ "{'pick_point': [x, y], 'place_point': [x, y]}\n"
+ "where [x, y] are pixel coordinates in the image."
+ )
+
+ try:
+ response = query_single_frame(frame, prompt, model_name=self.model_name)
+ return parse_qwen_points_response(response)
+ except Exception as e:
+ logger.error(f"Error querying Qwen for pick and place points: {e}")
+ return None
+
+ def _query_single_point(
+ self, frame: np.ndarray, query: str, point_type: str
+ ) -> Optional[Tuple[int, int]]:
+ """
+ Query Qwen to get a single point location.
+
+ Args:
+ frame: RGB image array
+ query: Natural language description of what to find
+ point_type: Type of point ('pick' or 'place') for context
+
+ Returns:
+ Tuple of (x, y) pixel coordinates or None if query fails
+ """
+ if point_type == "pick":
+ prompt = (
+ f"Look at this image carefully and find the {query}.\n\n"
+ "Instructions:\n"
+ "- Identify the exact object matching the description\n"
+ "- Choose the center point or the most graspable location on the object\n"
+ "- If multiple matching objects exist, choose the most prominent or accessible one\n"
+ "- Consider the object's shape and material when selecting the grasp point\n\n"
+ "Return ONLY a JSON object with this exact format:\n"
+ "{'point': [x, y]}\n"
+ "where [x, y] are the pixel coordinates of the optimal grasping point on the object."
+ )
+ else: # place
+ prompt = (
+ f"Look at this image and identify where to place an object {query}.\n\n"
+ "Instructions:\n"
+ "- Find a stable, flat surface at the specified location\n"
+ "- Ensure the placement spot is clear of obstacles\n"
+ "- Consider the size of the object being placed\n"
+ "- If the query specifies a container or specific spot, center the placement there\n"
+ "- Otherwise, find the most appropriate nearby surface\n\n"
+ "Return ONLY a JSON object with this exact format:\n"
+ "{'point': [x, y]}\n"
+ "where [x, y] are the pixel coordinates of the optimal placement location."
+ )
+
+ try:
+ response = query_single_frame(frame, prompt, model_name=self.model_name)
+ return parse_qwen_single_point_response(response)
+ except Exception as e:
+ logger.error(f"Error querying Qwen for {point_type} point: {e}")
+ return None
+
+ def __call__(self) -> Dict[str, Any]:
+ """
+ Execute the pick and place operation.
+
+ Returns:
+ Dictionary with operation results
+ """
+ super().__call__()
+
+ if not self._robot:
+ error_msg = "No robot instance provided to PickAndPlace skill"
+ logger.error(error_msg)
+ return {"success": False, "error": error_msg}
+
+ # Register skill as running
+ skill_library = self._robot.get_skills()
+ self.register_as_running("PickAndPlace", skill_library)
+
+ # Get camera frame
+ frame = self._get_camera_frame()
+ if frame is None:
+ return {"success": False, "error": "Failed to capture camera frame"}
+
+ # Convert RGB to BGR for OpenCV if needed
+ if len(frame.shape) == 3 and frame.shape[2] == 3:
+ frame = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)
+
+ # Get pick and place points from Qwen
+ pick_point = None
+ place_point = None
+
+ # Determine mode based on whether target_query is provided
+ if self.target_query is None:
+ # Pick only mode
+ logger.info("Pick-only mode (no target specified)")
+
+ # Query for pick point
+ pick_point = self._query_single_point(frame, self.object_query, "pick")
+ if not pick_point:
+ return {"success": False, "error": f"Failed to find {self.object_query}"}
+
+ # No place point needed for pick-only
+ place_point = None
+ else:
+ # Pick and place mode - can use either single or dual query
+ logger.info("Pick and place mode (target specified)")
+
+ # Try single query first for efficiency
+ points = self._query_pick_and_place_points(frame)
+ pick_point, place_point = points
+
+ logger.info(f"Pick point: {pick_point}, Place point: {place_point}")
+
+ # Save debug image with marked points
+ if pick_point or place_point:
+ save_debug_image_with_points(frame, pick_point, place_point)
+
+ # Execute pick (and optionally place) using the robot's interface
+ try:
+ if place_point:
+ # Pick and place
+ result = self._robot.pick_and_place(
+ pick_x=pick_point[0],
+ pick_y=pick_point[1],
+ place_x=place_point[0],
+ place_y=place_point[1],
+ )
+ else:
+ # Pick only
+ result = self._robot.pick_and_place(
+ pick_x=pick_point[0], pick_y=pick_point[1], place_x=None, place_y=None
+ )
+
+ if result:
+ if self.target_query:
+ message = (
+ f"Successfully picked {self.object_query} and placed it {self.target_query}"
+ )
+ else:
+ message = f"Successfully picked {self.object_query}"
+
+ return {
+ "success": True,
+ "pick_point": pick_point,
+ "place_point": place_point,
+ "object": self.object_query,
+ "target": self.target_query,
+ "message": message,
+ }
+ else:
+ operation = "Pick and place" if self.target_query else "Pick"
+ return {
+ "success": False,
+ "pick_point": pick_point,
+ "place_point": place_point,
+ "error": f"{operation} operation failed",
+ }
+
+ except Exception as e:
+ logger.error(f"Error executing pick and place: {e}")
+ return {
+ "success": False,
+ "error": f"Execution error: {str(e)}",
+ "pick_point": pick_point,
+ "place_point": place_point,
+ }
+ finally:
+ # Always unregister skill when done
+ self.stop()
+
+ def stop(self) -> None:
+ """
+ Stop the pick and place operation and perform cleanup.
+ """
+ logger.info("Stopping PickAndPlace skill")
+
+ # Unregister skill from skill library
+ if self._robot:
+ skill_library = self._robot.get_skills()
+ self.unregister_as_running("PickAndPlace", skill_library)
+
+ logger.info("PickAndPlace skill stopped successfully")
diff --git a/dimos/utils/transform_utils.py b/dimos/utils/transform_utils.py
index 31d3840884..eaedbcecf3 100644
--- a/dimos/utils/transform_utils.py
+++ b/dimos/utils/transform_utils.py
@@ -13,10 +13,11 @@
# limitations under the License.
import numpy as np
-from typing import Tuple, Dict, Any
+from typing import Tuple
import logging
+from scipy.spatial.transform import Rotation as R
-from dimos.types.vector import Vector
+from dimos_lcm.geometry_msgs import Pose, Point, Vector3, Quaternion
logger = logging.getLogger(__name__)
@@ -31,25 +32,216 @@ def distance_angle_to_goal_xy(distance: float, angle: float) -> Tuple[float, flo
return distance * np.cos(angle), distance * np.sin(angle)
+def pose_to_matrix(pose: Pose) -> np.ndarray:
+ """
+ Convert pose to 4x4 homogeneous transform matrix.
+
+ Args:
+ pose: Pose object with position and orientation (quaternion)
+
+ Returns:
+ 4x4 transformation matrix
+ """
+ # Extract position
+ tx, ty, tz = pose.position.x, pose.position.y, pose.position.z
+
+ # Create rotation matrix from quaternion using scipy
+ quat = [pose.orientation.x, pose.orientation.y, pose.orientation.z, pose.orientation.w]
+
+ # Check for zero norm quaternion and use identity if invalid
+ quat_norm = np.linalg.norm(quat)
+ if quat_norm == 0.0:
+ # Use identity quaternion [0, 0, 0, 1] if zero norm detected
+ quat = [0.0, 0.0, 0.0, 1.0]
+
+ rotation = R.from_quat(quat)
+ Rot = rotation.as_matrix()
+
+ # Create 4x4 transform
+ T = np.eye(4)
+ T[:3, :3] = Rot
+ T[:3, 3] = [tx, ty, tz]
+
+ return T
+
+
+def matrix_to_pose(T: np.ndarray) -> Pose:
+ """
+ Convert 4x4 transformation matrix to Pose object.
+
+ Args:
+ T: 4x4 transformation matrix
+
+ Returns:
+ Pose object with position and orientation (quaternion)
+ """
+ # Extract position
+ pos = Point(T[0, 3], T[1, 3], T[2, 3])
+
+ # Extract rotation matrix and convert to quaternion
+ Rot = T[:3, :3]
+ rotation = R.from_matrix(Rot)
+ quat = rotation.as_quat() # Returns [x, y, z, w]
+
+ orientation = Quaternion(quat[0], quat[1], quat[2], quat[3])
+
+ return Pose(pos, orientation)
+
+
+def apply_transform(pose: Pose, transform_matrix: np.ndarray) -> Pose:
+ """
+ Apply a transformation matrix to a pose.
+
+ Args:
+ pose: Input pose
+ transform_matrix: 4x4 transformation matrix to apply
+
+ Returns:
+ Transformed pose
+ """
+ # Convert pose to matrix
+ T_pose = pose_to_matrix(pose)
+
+ # Apply transform
+ T_result = transform_matrix @ T_pose
+
+ # Convert back to pose
+ return matrix_to_pose(T_result)
+
+
+def optical_to_robot_frame(pose: Pose) -> Pose:
+ """
+ Convert pose from optical camera frame to robot frame convention.
+
+ Optical Camera Frame (e.g., ZED):
+ - X: Right
+ - Y: Down
+ - Z: Forward (away from camera)
+
+ Robot Frame (ROS/REP-103):
+ - X: Forward
+ - Y: Left
+ - Z: Up
+
+ Args:
+ pose: Pose in optical camera frame
+
+ Returns:
+ Pose in robot frame
+ """
+ # Position transformation
+ robot_x = pose.position.z # Forward = Camera Z
+ robot_y = -pose.position.x # Left = -Camera X
+ robot_z = -pose.position.y # Up = -Camera Y
+
+ # Rotation transformation using quaternions
+ # First convert quaternion to rotation matrix
+ quat_optical = [pose.orientation.x, pose.orientation.y, pose.orientation.z, pose.orientation.w]
+ R_optical = R.from_quat(quat_optical).as_matrix()
+
+ # Coordinate frame transformation matrix from optical to robot
+ # X_robot = Z_optical, Y_robot = -X_optical, Z_robot = -Y_optical
+ T_frame = np.array(
+ [
+ [0, 0, 1], # X_robot = Z_optical
+ [-1, 0, 0], # Y_robot = -X_optical
+ [0, -1, 0], # Z_robot = -Y_optical
+ ]
+ )
+
+ # Transform the rotation matrix
+ R_robot = T_frame @ R_optical @ T_frame.T
+
+ # Convert back to quaternion
+ quat_robot = R.from_matrix(R_robot).as_quat() # [x, y, z, w]
+
+ return Pose(
+ Point(robot_x, robot_y, robot_z),
+ Quaternion(quat_robot[0], quat_robot[1], quat_robot[2], quat_robot[3]),
+ )
+
+
+def robot_to_optical_frame(pose: Pose) -> Pose:
+ """
+ Convert pose from robot frame to optical camera frame convention.
+ This is the inverse of optical_to_robot_frame.
+
+ Args:
+ pose: Pose in robot frame
+
+ Returns:
+ Pose in optical camera frame
+ """
+ # Position transformation (inverse)
+ optical_x = -pose.position.y # Right = -Left
+ optical_y = -pose.position.z # Down = -Up
+ optical_z = pose.position.x # Forward = Forward
+
+ # Rotation transformation using quaternions
+ quat_robot = [pose.orientation.x, pose.orientation.y, pose.orientation.z, pose.orientation.w]
+ R_robot = R.from_quat(quat_robot).as_matrix()
+
+ # Coordinate frame transformation matrix from Robot to optical (inverse of optical to Robot)
+ # This is the transpose of the forward transformation
+ T_frame_inv = np.array(
+ [
+ [0, -1, 0], # X_optical = -Y_robot
+ [0, 0, -1], # Y_optical = -Z_robot
+ [1, 0, 0], # Z_optical = X_robot
+ ]
+ )
+
+ # Transform the rotation matrix
+ R_optical = T_frame_inv @ R_robot @ T_frame_inv.T
+
+ # Convert back to quaternion
+ quat_optical = R.from_matrix(R_optical).as_quat() # [x, y, z, w]
+
+ return Pose(
+ Point(optical_x, optical_y, optical_z),
+ Quaternion(quat_optical[0], quat_optical[1], quat_optical[2], quat_optical[3]),
+ )
+
+
+def yaw_towards_point(position: Point, target_point: Point = None) -> float:
+ """
+ Calculate yaw angle from target point to position (away from target).
+ This is commonly used for object orientation in grasping applications.
+ Assumes robot frame where X is forward and Y is left.
+
+ Args:
+ position: Current position in robot frame
+ target_point: Reference point (default: origin)
+
+ Returns:
+ Yaw angle in radians pointing from target_point to position
+ """
+ if target_point is None:
+ target_point = Point(0.0, 0.0, 0.0)
+ direction_x = position.x - target_point.x
+ direction_y = position.y - target_point.y
+ return np.arctan2(direction_y, direction_x)
+
+
def transform_robot_to_map(
- robot_position: Vector, robot_rotation: Vector, position: Vector, rotation: Vector
-) -> Tuple[Vector, Vector]:
+ robot_position: Point, robot_rotation: Vector3, position: Point, rotation: Vector3
+) -> Tuple[Point, Vector3]:
"""Transform position and rotation from robot frame to map frame.
Args:
robot_position: Current robot position in map frame
robot_rotation: Current robot rotation in map frame
- position: Position in robot frame as Vector (x, y, z)
- rotation: Rotation in robot frame as Vector (roll, pitch, yaw) in radians
+ position: Position in robot frame as Point (x, y, z)
+ rotation: Rotation in robot frame as Vector3 (roll, pitch, yaw) in radians
Returns:
Tuple of (transformed_position, transformed_rotation) where:
- - transformed_position: Vector (x, y, z) in map frame
- - transformed_rotation: Vector (roll, pitch, yaw) in map frame
+ - transformed_position: Point (x, y, z) in map frame
+ - transformed_rotation: Vector3 (roll, pitch, yaw) in map frame
Example:
- obj_pos_robot = Vector(1.0, 0.5, 0.0) # 1m forward, 0.5m left of robot
- obj_rot_robot = Vector(0.0, 0.0, 0.0) # No rotation relative to robot
+ obj_pos_robot = Point(1.0, 0.5, 0.0) # 1m forward, 0.5m left of robot
+ obj_rot_robot = Vector3(0.0, 0.0, 0.0) # No rotation relative to robot
map_pos, map_rot = transform_robot_to_map(robot_position, robot_rotation, obj_pos_robot, obj_rot_robot)
"""
@@ -79,7 +271,128 @@ def transform_robot_to_map(
map_pitch = robot_rot.y + rot_pitch # Add robot's pitch
map_yaw_rot = normalize_angle(robot_yaw + rot_yaw) # Add robot's yaw and normalize
- transformed_position = Vector(map_x, map_y, map_z)
- transformed_rotation = Vector(map_roll, map_pitch, map_yaw_rot)
+ transformed_position = Point(map_x, map_y, map_z)
+ transformed_rotation = Vector3(map_roll, map_pitch, map_yaw_rot)
return transformed_position, transformed_rotation
+
+
+def create_transform_from_6dof(translation: Vector3, euler_angles: Vector3) -> np.ndarray:
+ """
+ Create a 4x4 transformation matrix from 6DOF parameters.
+
+ Args:
+ translation: Translation vector [x, y, z] in meters
+ euler_angles: Euler angles [rx, ry, rz] in radians (XYZ convention)
+
+ Returns:
+ 4x4 transformation matrix
+ """
+ # Create transformation matrix
+ T = np.eye(4)
+
+ # Set translation
+ T[0:3, 3] = [translation.x, translation.y, translation.z]
+
+ # Set rotation using scipy
+ if np.linalg.norm([euler_angles.x, euler_angles.y, euler_angles.z]) > 1e-6:
+ rotation = R.from_euler("xyz", [euler_angles.x, euler_angles.y, euler_angles.z])
+ T[0:3, 0:3] = rotation.as_matrix()
+
+ return T
+
+
+def invert_transform(T: np.ndarray) -> np.ndarray:
+ """
+ Invert a 4x4 transformation matrix efficiently.
+
+ Args:
+ T: 4x4 transformation matrix
+
+ Returns:
+ Inverted 4x4 transformation matrix
+ """
+ # For homogeneous transform matrices, we can use the special structure:
+ # [R t]^-1 = [R^T -R^T*t]
+ # [0 1] [0 1 ]
+
+ Rot = T[:3, :3]
+ t = T[:3, 3]
+
+ T_inv = np.eye(4)
+ T_inv[:3, :3] = Rot.T
+ T_inv[:3, 3] = -Rot.T @ t
+
+ return T_inv
+
+
+def compose_transforms(*transforms: np.ndarray) -> np.ndarray:
+ """
+ Compose multiple transformation matrices.
+
+ Args:
+ *transforms: Variable number of 4x4 transformation matrices
+
+ Returns:
+ Composed 4x4 transformation matrix (T1 @ T2 @ ... @ Tn)
+ """
+ result = np.eye(4)
+ for T in transforms:
+ result = result @ T
+ return result
+
+
+def euler_to_quaternion(euler_angles: Vector3, degrees: bool = False) -> Quaternion:
+ """
+ Convert euler angles to quaternion.
+
+    Args:
+        euler_angles: Euler angles as Vector3 [roll, pitch, yaw] (XYZ convention)
+        degrees: If True, interpret the angles as degrees; defaults to radians.
+ Returns:
+ Quaternion object [x, y, z, w]
+ """
+ rotation = R.from_euler(
+ "xyz", [euler_angles.x, euler_angles.y, euler_angles.z], degrees=degrees
+ )
+ quat = rotation.as_quat() # Returns [x, y, z, w]
+ return Quaternion(quat[0], quat[1], quat[2], quat[3])
+
+
+def quaternion_to_euler(quaternion: Quaternion, degrees: bool = False) -> Vector3:
+ """
+ Convert quaternion to euler angles.
+
+    Args:
+        quaternion: Quaternion object [x, y, z, w]
+        degrees: If True, return angles in degrees; otherwise radians (normalized)
+    Returns:
+        Euler angles as Vector3 [roll, pitch, yaw] (XYZ convention)
+ """
+ quat = [quaternion.x, quaternion.y, quaternion.z, quaternion.w]
+ rotation = R.from_quat(quat)
+ euler = rotation.as_euler("xyz", degrees=degrees) # Returns [roll, pitch, yaw]
+ if not degrees:
+ return Vector3(
+ normalize_angle(euler[0]), normalize_angle(euler[1]), normalize_angle(euler[2])
+ )
+ else:
+ return Vector3(euler[0], euler[1], euler[2])
+
+
+def get_distance(pose1: Pose, pose2: Pose) -> float:
+ """
+ Calculate Euclidean distance between two poses.
+
+ Args:
+ pose1: First pose
+ pose2: Second pose
+
+ Returns:
+ Euclidean distance between the two poses in meters
+ """
+ dx = pose1.position.x - pose2.position.x
+ dy = pose1.position.y - pose2.position.y
+ dz = pose1.position.z - pose2.position.z
+
+ return np.linalg.norm(np.array([dx, dy, dz]))
diff --git a/docs/modules_CN.md b/docs/modules_CN.md
new file mode 100644
index 0000000000..d8f088ef59
--- /dev/null
+++ b/docs/modules_CN.md
@@ -0,0 +1,188 @@
+# Dimensional 模块系统
+
+DimOS 模块系统使用 Dask 进行计算分布和 LCM(轻量级通信和编组)进行高性能进程间通信,实现分布式、多进程的机器人应用。
+
+## 核心概念
+
+### 1. 模块定义
+模块是继承自 `dimos.core.Module` 的 Python 类,定义输入、输出和 RPC 方法:
+
+```python
+from dimos.core import Module, In, Out, rpc
+from dimos.msgs.geometry_msgs import Vector3
+
+class MyModule(Module): # ROS Node
+ # 将输入/输出声明为初始化为 None 的类属性
+ data_in: In[Vector3] = None # ROS Subscriber
+ data_out: Out[Vector3] = None # ROS Publisher
+
+    def __init__(self):
+ # 调用父类 Module 初始化
+ super().__init__()
+
+ @rpc
+ def remote_method(self, param):
+ """使用 @rpc 装饰的方法可以远程调用"""
+ return param * 2
+```
+
+### 2. 模块部署
+使用 `dimos.deploy()` 方法在 Dask 工作进程中部署模块:
+
+```python
+from dimos import core
+
+# 启动具有 N 个工作进程的 Dask 集群
+dimos = core.start(4)
+
+# 部署模块时可以传递初始化参数
+# 在这种情况下,param1 和 param2 被传递到模块初始化中
+module = dimos.deploy(Module, param1="value1", param2=123)
+```
+
+### 3. 流连接
+模块通过使用 LCM 传输的响应式流进行通信:
+
+```python
+# 为输出配置 LCM 传输
+module1.data_out.transport = core.LCMTransport("/topic_name", MessageType)
+
+# 将模块输入连接到输出
+module2.data_in.connect(module1.data_out)
+
+# 访问底层的 Observable 流
+stream = module1.data_out.observable()
+stream.subscribe(lambda msg: print(f"接收到: {msg}"))
+```
+
+### 4. 模块生命周期
+```python
+# 启动模块以开始处理
+module.start() # 如果定义了 @rpc start() 方法,则调用它
+
+# 检查模块 I/O 配置
+print(module.io().result()) # 显示输入、输出和 RPC 方法
+
+# 优雅关闭
+dimos.shutdown()
+```
+
+## 实际示例:机器人控制系统
+
+```python
+# 连接模块封装机器人硬件/仿真
+connection = dimos.deploy(ConnectionModule, ip=robot_ip)
+connection.lidar.transport = core.LCMTransport("/lidar", LidarMessage)
+connection.video.transport = core.LCMTransport("/video", Image)
+
+# 感知模块处理传感器数据
+perception = dimos.deploy(PersonTrackingStream, camera_intrinsics=[...])
+perception.video.connect(connection.video)
+perception.tracking_data.transport = core.pLCMTransport("/person_tracking")
+
+# 开始处理
+connection.start()
+perception.start()
+
+# 通过 RPC 启用跟踪
+perception.enable_tracking()
+
+# 获取最新的跟踪数据
+data = perception.get_tracking_data()
+```
+
+## LCM 传输配置
+
+```python
+# 用于简单类型(如激光雷达)的标准 LCM 传输
+connection.lidar.transport = core.LCMTransport("/lidar", LidarMessage)
+
+# 用于复杂 Python 对象/字典的基于 pickle 的传输
+connection.tracking_data.transport = core.pLCMTransport("/person_tracking")
+
+# 自动配置 LCM 系统缓冲区(在容器中必需)
+from dimos.protocol import pubsub
+pubsub.lcm.autoconf()
+```
+
+这种架构使得能够将复杂的机器人系统构建为可组合的分布式模块,这些模块通过流和 RPC 高效通信,从单机扩展到集群。
+
+# Dimensional 安装指南
+## Python 安装(Ubuntu 22.04)
+
+```bash
+sudo apt install python3-venv
+
+# 克隆仓库(dev 分支,无子模块)
+git clone -b dev https://github.com/dimensionalOS/dimos.git
+cd dimos
+
+# 创建并激活虚拟环境
+python3 -m venv venv
+source venv/bin/activate
+
+sudo apt install portaudio19-dev python3-pyaudio
+
+# 如果尚未安装,请安装 torch 和 torchvision
+# 示例 CUDA 11.8,Pytorch 2.0.1(如果需要不同的 pytorch 版本,请替换)
+pip install torch==2.0.1 torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
+```
+
+### 安装依赖
+```bash
+# 仅 CPU(建议首先尝试)
+pip install .[cpu,dev]
+
+# CUDA 安装
+pip install .[cuda,dev]
+
+# 复制并配置环境变量
+cp default.env .env
+```
+
+### 测试安装
+```bash
+# 运行标准测试
+pytest -s dimos/
+
+# 测试模块功能
+pytest -s -m module dimos/
+
+# 测试 LCM 通信
+pytest -s -m lcm dimos/
+```
+
+# Unitree Go2 快速开始
+
+要快速测试模块系统,您可以直接运行 Unitree Go2 多进程示例:
+
+```bash
+# 确保设置了所需的环境变量
+export ROBOT_IP=
+
+# 运行多进程 Unitree Go2 示例
+python dimos/robot/unitree_webrtc/multiprocess/unitree_go2.py
+```
+
+## 模块系统的高级特性
+
+### 分布式计算
+DimOS 模块系统建立在 Dask 之上,提供了强大的分布式计算能力:
+
+- **自动负载均衡**:模块自动分布在可用的工作进程中
+- **容错性**:如果工作进程失败,模块可以在其他工作进程上重新启动
+- **可扩展性**:从单机到集群的无缝扩展
+
+### 响应式编程模型
+使用 RxPY 实现的响应式流提供了:
+
+- **异步处理**:非阻塞的数据流处理
+- **背压处理**:自动管理快速生产者和慢速消费者
+- **操作符链**:使用 map、filter、merge 等操作符进行流转换
+
+### 性能优化
+LCM 传输针对机器人应用进行了优化:
+
+- **零拷贝**:大型消息的高效内存使用
+- **低延迟**:微秒级的消息传递
+- **多播支持**:一对多的高效通信
\ No newline at end of file
diff --git a/pyproject.toml b/pyproject.toml
index 7d51aa91d4..436b9a5750 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -115,6 +115,9 @@ manipulation = [
"pyyaml>=6.0",
"contact-graspnet-pytorch @ git+https://github.com/dimensionalOS/contact_graspnet_pytorch.git",
+ # piper arm
+ "piper-sdk",
+
# Visualization (Optional)
"kaleido>=0.2.1",
"plotly>=5.9.0",
diff --git a/tests/test_manipulation_perception_pipeline.py b/tests/test_manipulation_perception_pipeline.py
index 8b333ec310..227f991650 100644
--- a/tests/test_manipulation_perception_pipeline.py
+++ b/tests/test_manipulation_perception_pipeline.py
@@ -36,7 +36,7 @@
from dimos.stream.stereo_camera_streams.zed import ZEDCameraStream
from dimos.web.robot_web_interface import RobotWebInterface
from dimos.utils.logging_config import logger
-from dimos.perception.manip_aio_pipeline import ManipulationPipeline
+from dimos.manipulation.manip_aio_pipeline import ManipulationPipeline
def monitor_grasps(pipeline):
diff --git a/tests/test_manipulation_perception_pipeline.py.py b/tests/test_manipulation_perception_pipeline.py.py
new file mode 100644
index 0000000000..227f991650
--- /dev/null
+++ b/tests/test_manipulation_perception_pipeline.py.py
@@ -0,0 +1,167 @@
+# Copyright 2025 Dimensional Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import sys
+import time
+import threading
+from reactivex import operators as ops
+
+import tests.test_header
+
+from pyzed import sl
+from dimos.stream.stereo_camera_streams.zed import ZEDCameraStream
+from dimos.web.robot_web_interface import RobotWebInterface
+from dimos.utils.logging_config import logger
+from dimos.manipulation.manip_aio_pipeline import ManipulationPipeline
+
+
+def monitor_grasps(pipeline):
+ """Monitor and print grasp updates in a separate thread."""
+ print(" Grasp monitor started...")
+
+ last_grasp_count = 0
+ last_update_time = time.time()
+
+ while True:
+ try:
+ # Get latest grasps using the getter function
+ grasps = pipeline.get_latest_grasps(timeout=0.5)
+ current_time = time.time()
+
+ if grasps is not None:
+ current_count = len(grasps)
+ if current_count != last_grasp_count:
+ print(f" Grasps received: {current_count} (at {time.strftime('%H:%M:%S')})")
+ if current_count > 0:
+ best_score = max(grasp.get("score", 0.0) for grasp in grasps)
+ print(f" Best grasp score: {best_score:.3f}")
+ last_grasp_count = current_count
+ last_update_time = current_time
+ else:
+ # Show periodic "still waiting" message
+ if current_time - last_update_time > 10.0:
+ print(f" Still waiting for grasps... ({time.strftime('%H:%M:%S')})")
+ last_update_time = current_time
+
+ time.sleep(1.0) # Check every second
+
+ except Exception as e:
+ print(f" Error in grasp monitor: {e}")
+ time.sleep(2.0)
+
+
+def main():
+ """Test point cloud filtering with grasp generation using ManipulationPipeline."""
+ print(" Testing point cloud filtering + grasp generation with ManipulationPipeline...")
+
+ # Configuration
+ min_confidence = 0.6
+ web_port = 5555
+ grasp_server_url = "ws://18.224.39.74:8000/ws/grasp"
+
+ try:
+ # Initialize ZED camera stream
+ zed_stream = ZEDCameraStream(resolution=sl.RESOLUTION.HD1080, fps=10)
+
+ # Get camera intrinsics
+ camera_intrinsics_dict = zed_stream.get_camera_info()
+ camera_intrinsics = [
+ camera_intrinsics_dict["fx"],
+ camera_intrinsics_dict["fy"],
+ camera_intrinsics_dict["cx"],
+ camera_intrinsics_dict["cy"],
+ ]
+
+ # Create the concurrent manipulation pipeline WITH grasp generation
+ pipeline = ManipulationPipeline(
+ camera_intrinsics=camera_intrinsics,
+ min_confidence=min_confidence,
+ max_objects=10,
+ grasp_server_url=grasp_server_url,
+ enable_grasp_generation=True, # Enable grasp generation
+ )
+
+ # Create ZED stream
+ zed_frame_stream = zed_stream.create_stream().pipe(ops.share())
+
+ # Create concurrent processing streams
+ streams = pipeline.create_streams(zed_frame_stream)
+ detection_viz_stream = streams["detection_viz"]
+ pointcloud_viz_stream = streams["pointcloud_viz"]
+ grasps_stream = streams.get("grasps") # Get grasp stream if available
+ grasp_overlay_stream = streams.get("grasp_overlay") # Get grasp overlay stream if available
+
+ except ImportError:
+ print("Error: ZED SDK not installed. Please install pyzed package.")
+ sys.exit(1)
+ except RuntimeError as e:
+ print(f"Error: Failed to open ZED camera: {e}")
+ sys.exit(1)
+
+ try:
+ # Set up web interface with concurrent visualization streams
+ print("Initializing web interface...")
+ web_interface = RobotWebInterface(
+ port=web_port,
+ object_detection=detection_viz_stream,
+ pointcloud_stream=pointcloud_viz_stream,
+ grasp_overlay_stream=grasp_overlay_stream,
+ )
+
+ # Start grasp monitoring in background thread
+ grasp_monitor_thread = threading.Thread(
+ target=monitor_grasps, args=(pipeline,), daemon=True
+ )
+ grasp_monitor_thread.start()
+
+ print(f"\n Point Cloud + Grasp Generation Test Running:")
+ print(f" Web Interface: http://localhost:{web_port}")
+ print(f" Object Detection View: RGB with bounding boxes")
+ print(f" Point Cloud View: Depth with colored point clouds and 3D bounding boxes")
+ print(f" Confidence threshold: {min_confidence}")
+ print(f" Grasp server: {grasp_server_url}")
+ print(f" Available streams: {list(streams.keys())}")
+ print("\nPress Ctrl+C to stop the test\n")
+
+ # Start web server (blocking call)
+ web_interface.run()
+
+ except KeyboardInterrupt:
+ print("\nTest interrupted by user")
+ except Exception as e:
+ print(f"Error during test: {e}")
+ finally:
+ print("Cleaning up resources...")
+ if "zed_stream" in locals():
+ zed_stream.cleanup()
+ if "pipeline" in locals():
+ pipeline.cleanup()
+ print("Test completed")
+
+
+if __name__ == "__main__":
+ main()
diff --git a/tests/test_manipulation_pipeline_single_frame.py b/tests/test_manipulation_pipeline_single_frame.py
index 061eb9035e..629ba4dbee 100644
--- a/tests/test_manipulation_pipeline_single_frame.py
+++ b/tests/test_manipulation_pipeline_single_frame.py
@@ -34,7 +34,7 @@
import open3d as o3d
from dimos.perception.pointcloud.utils import visualize_clustered_point_clouds, visualize_voxel_grid
-from dimos.perception.manip_aio_processer import ManipulationProcessor
+from dimos.manipulation.manip_aio_processer import ManipulationProcessor
from dimos.perception.pointcloud.utils import (
load_camera_matrix_from_yaml,
visualize_pcd,
diff --git a/tests/test_manipulation_pipeline_single_frame_lcm.py b/tests/test_manipulation_pipeline_single_frame_lcm.py
index 635f82c9c9..7b57887ddc 100644
--- a/tests/test_manipulation_pipeline_single_frame_lcm.py
+++ b/tests/test_manipulation_pipeline_single_frame_lcm.py
@@ -42,7 +42,7 @@
from lcm_msgs.sensor_msgs import CameraInfo as LCMCameraInfo
from dimos.perception.pointcloud.utils import visualize_clustered_point_clouds, visualize_voxel_grid
-from dimos.perception.manip_aio_processer import ManipulationProcessor
+from dimos.manipulation.manip_aio_processer import ManipulationProcessor
from dimos.perception.grasp_generation.utils import visualize_grasps_3d
from dimos.perception.pointcloud.utils import visualize_pcd
from dimos.utils.logging_config import setup_logger
diff --git a/tests/test_pick_and_place_module.py b/tests/test_pick_and_place_module.py
new file mode 100644
index 0000000000..6a8470863e
--- /dev/null
+++ b/tests/test_pick_and_place_module.py
@@ -0,0 +1,355 @@
+#!/usr/bin/env python3
+# Copyright 2025 Dimensional Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+Run script for Piper Arm robot with pick and place functionality.
+Subscribes to visualization images and handles mouse/keyboard input.
+"""
+
+import cv2
+import sys
+import asyncio
+import threading
+import time
+import numpy as np
+from typing import Optional
+
+try:
+ import pyzed.sl as sl
+except ImportError:
+ print("Error: ZED SDK not installed.")
+ sys.exit(1)
+
+from dimos.robot.agilex.piper_arm import PiperArmRobot
+from dimos.utils.logging_config import setup_logger
+
+# Import LCM message types
+from dimos_lcm.sensor_msgs import Image
+from dimos.protocol.pubsub.lcmpubsub import LCM, Topic
+
+logger = setup_logger("dimos.tests.test_pick_and_place_module")
+
+# Global for mouse events
+mouse_click = None
+camera_mouse_click = None
+current_window = None
+pick_location = None # Store pick location
+place_location = None # Store place location
+place_mode = False # Track if we're in place selection mode
+
+
+def mouse_callback(event, x, y, _flags, param):
+ global mouse_click, camera_mouse_click
+ window_name = param
+ if event == cv2.EVENT_LBUTTONDOWN:
+ if window_name == "Camera Feed":
+ camera_mouse_click = (x, y)
+ else:
+ mouse_click = (x, y)
+
+
+class VisualizationNode:
+ """Node that subscribes to visualization images and handles user input."""
+
+ def __init__(self, robot: PiperArmRobot):
+ self.lcm = LCM()
+ self.latest_viz = None
+ self.latest_camera = None
+ self._running = False
+ self.robot = robot
+
+ # Subscribe to visualization topic
+ self.viz_topic = Topic("/manipulation/viz", Image)
+ self.camera_topic = Topic("/zed/color_image", Image)
+
+ def start(self):
+ """Start the visualization node."""
+ self._running = True
+ self.lcm.start()
+
+ # Subscribe to visualization topic
+ self.lcm.subscribe(self.viz_topic, self._on_viz_image)
+ # Subscribe to camera topic for point selection
+ self.lcm.subscribe(self.camera_topic, self._on_camera_image)
+
+ logger.info("Visualization node started")
+
+ def stop(self):
+ """Stop the visualization node."""
+ self._running = False
+ cv2.destroyAllWindows()
+
+ def _on_viz_image(self, msg: Image, topic: str):
+ """Handle visualization image messages."""
+ try:
+ # Convert LCM message to numpy array
+ data = np.frombuffer(msg.data, dtype=np.uint8)
+ if msg.encoding == "rgb8":
+ image = data.reshape((msg.height, msg.width, 3))
+ # Convert RGB to BGR for OpenCV
+ image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)
+ self.latest_viz = image
+ except Exception as e:
+ logger.error(f"Error processing viz image: {e}")
+
+ def _on_camera_image(self, msg: Image, topic: str):
+ """Handle camera image messages."""
+ try:
+ # Convert LCM message to numpy array
+ data = np.frombuffer(msg.data, dtype=np.uint8)
+ if msg.encoding == "rgb8":
+ image = data.reshape((msg.height, msg.width, 3))
+ # Convert RGB to BGR for OpenCV
+ image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)
+ self.latest_camera = image
+ except Exception as e:
+ logger.error(f"Error processing camera image: {e}")
+
+ def run_visualization(self):
+ """Run the visualization loop with user interaction."""
+ global mouse_click, camera_mouse_click, pick_location, place_location, place_mode
+
+ # Setup windows
+ cv2.namedWindow("Pick and Place")
+ cv2.setMouseCallback("Pick and Place", mouse_callback, "Pick and Place")
+
+ cv2.namedWindow("Camera Feed")
+ cv2.setMouseCallback("Camera Feed", mouse_callback, "Camera Feed")
+
+ print("=== Piper Arm Robot - Pick and Place ===")
+ print("Control mode: Module-based with LCM communication")
+ print("\nPICK AND PLACE WORKFLOW:")
+ print("1. Click on an object to select PICK location")
+ print("2. Click again to select PLACE location (auto pick & place)")
+ print("3. OR press 'p' after first click for pick-only task")
+ print("\nCONTROLS:")
+ print(" 'p' - Execute pick-only task (after selecting pick location)")
+ print(" 'r' - Reset everything")
+ print(" 'q' - Quit")
+ print(" 's' - SOFT STOP (emergency stop)")
+ print(" 'g' - RELEASE GRIPPER (open gripper)")
+ print(" 'SPACE' - EXECUTE target pose (manual override)")
+ print("\nNOTE: Click on objects in the Camera Feed window!")
+
+ while self._running:
+ # Show camera feed with status overlay
+ if self.latest_camera is not None:
+ display_image = self.latest_camera.copy()
+
+ # Add status text
+ status_text = ""
+ if pick_location is None:
+ status_text = "Click to select PICK location"
+ color = (0, 255, 0)
+ elif place_location is None:
+ status_text = "Click to select PLACE location (or press 'p' for pick-only)"
+ color = (0, 255, 255)
+ else:
+ status_text = "Executing pick and place..."
+ color = (255, 0, 255)
+
+ cv2.putText(
+ display_image, status_text, (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 0.7, color, 2
+ )
+
+ # Draw pick location marker if set
+ if pick_location is not None:
+ # Simple circle marker
+ cv2.circle(display_image, pick_location, 10, (0, 255, 0), 2)
+ cv2.circle(display_image, pick_location, 2, (0, 255, 0), -1)
+
+ # Simple label
+ cv2.putText(
+ display_image,
+ "PICK",
+ (pick_location[0] + 15, pick_location[1] + 5),
+ cv2.FONT_HERSHEY_SIMPLEX,
+ 0.6,
+ (0, 255, 0),
+ 2,
+ )
+
+ # Draw place location marker if set
+ if place_location is not None:
+ # Simple circle marker
+ cv2.circle(display_image, place_location, 10, (0, 255, 255), 2)
+ cv2.circle(display_image, place_location, 2, (0, 255, 255), -1)
+
+ # Simple label
+ cv2.putText(
+ display_image,
+ "PLACE",
+ (place_location[0] + 15, place_location[1] + 5),
+ cv2.FONT_HERSHEY_SIMPLEX,
+ 0.6,
+ (0, 255, 255),
+ 2,
+ )
+
+ # Draw simple arrow between pick and place
+ if pick_location is not None:
+ cv2.arrowedLine(
+ display_image,
+ pick_location,
+ place_location,
+ (255, 255, 0),
+ 2,
+ tipLength=0.05,
+ )
+
+ cv2.imshow("Camera Feed", display_image)
+
+ # Show visualization if available
+ if self.latest_viz is not None:
+ cv2.imshow("Pick and Place", self.latest_viz)
+
+ # Handle keyboard input
+ key = cv2.waitKey(1) & 0xFF
+ if key != 255: # Key was pressed
+ if key == ord("q"):
+ logger.info("Quit requested")
+ self._running = False
+ break
+ elif key == ord("r"):
+ # Reset everything
+ pick_location = None
+ place_location = None
+ place_mode = False
+ logger.info("Reset pick and place selections")
+ # Also send reset to robot
+ action = self.robot.handle_keyboard_command("r")
+ if action:
+ logger.info(f"Action: {action}")
+ elif key == ord("p"):
+ # Execute pick-only task if pick location is set
+ if pick_location is not None:
+ logger.info(f"Executing pick-only task at {pick_location}")
+ result = self.robot.pick_and_place(
+ pick_location[0],
+ pick_location[1],
+ None, # No place location
+ None,
+ )
+ logger.info(f"Pick task started: {result}")
+ # Clear selection after sending
+ pick_location = None
+ place_location = None
+ else:
+ logger.warning("Please select a pick location first!")
+ else:
+ # Send keyboard command to robot
+ if key in [82, 84]: # Arrow keys
+ action = self.robot.handle_keyboard_command(str(key))
+ else:
+ action = self.robot.handle_keyboard_command(chr(key))
+ if action:
+ logger.info(f"Action: {action}")
+
+ # Handle mouse clicks
+ if camera_mouse_click:
+ x, y = camera_mouse_click
+
+ if pick_location is None:
+ # First click - set pick location
+ pick_location = (x, y)
+ logger.info(f"Pick location set at ({x}, {y})")
+ elif place_location is None:
+ # Second click - set place location and execute
+ place_location = (x, y)
+ logger.info(f"Place location set at ({x}, {y})")
+ logger.info(f"Executing pick at {pick_location} and place at ({x}, {y})")
+
+ # Start pick and place task with both locations
+ result = self.robot.pick_and_place(pick_location[0], pick_location[1], x, y)
+ logger.info(f"Pick and place task started: {result}")
+
+ # Clear all points after sending mission
+ pick_location = None
+ place_location = None
+
+ camera_mouse_click = None
+
+ # Handle mouse click from Pick and Place window (if viz is running)
+ elif mouse_click and self.latest_viz is not None:
+ # Similar logic for viz window clicks
+ x, y = mouse_click
+
+ if pick_location is None:
+ # First click - set pick location
+ pick_location = (x, y)
+ logger.info(f"Pick location set at ({x}, {y}) from viz window")
+ elif place_location is None:
+ # Second click - set place location and execute
+ place_location = (x, y)
+ logger.info(f"Place location set at ({x}, {y}) from viz window")
+ logger.info(f"Executing pick at {pick_location} and place at ({x}, {y})")
+
+ # Start pick and place task with both locations
+ result = self.robot.pick_and_place(pick_location[0], pick_location[1], x, y)
+ logger.info(f"Pick and place task started: {result}")
+
+ # Clear all points after sending mission
+ pick_location = None
+ place_location = None
+
+ mouse_click = None
+
+ time.sleep(0.03) # ~30 FPS
+
+
+async def run_piper_arm_with_viz():
+ """Run the Piper Arm robot with visualization."""
+ logger.info("Starting Piper Arm Robot")
+
+ # Create robot instance
+ robot = PiperArmRobot()
+
+ try:
+ # Start the robot
+ await robot.start()
+
+ # Give modules time to fully initialize
+ await asyncio.sleep(2)
+
+ # Create and start visualization node
+ viz_node = VisualizationNode(robot)
+ viz_node.start()
+
+ # Run visualization in separate thread
+ viz_thread = threading.Thread(target=viz_node.run_visualization, daemon=True)
+ viz_thread.start()
+
+ # Keep running until visualization stops
+ while viz_node._running:
+ await asyncio.sleep(0.1)
+
+ # Stop visualization
+ viz_node.stop()
+
+ except Exception as e:
+ logger.error(f"Error running robot: {e}")
+ import traceback
+
+ traceback.print_exc()
+
+ finally:
+ # Clean up
+ robot.stop()
+ logger.info("Robot stopped")
+
+
+if __name__ == "__main__":
+ # Run the robot
+ asyncio.run(run_piper_arm_with_viz())
diff --git a/tests/test_pick_and_place_skill.py b/tests/test_pick_and_place_skill.py
new file mode 100644
index 0000000000..40cf2c23b0
--- /dev/null
+++ b/tests/test_pick_and_place_skill.py
@@ -0,0 +1,154 @@
+#!/usr/bin/env python3
+# Copyright 2025 Dimensional Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+Run script for Piper Arm robot with pick and place functionality.
+Uses hardcoded points and the PickAndPlace skill.
+"""
+
+import sys
+import asyncio
+
+try:
+ import pyzed.sl as sl # Required for ZED camera
+except ImportError:
+ print("Error: ZED SDK not installed.")
+ sys.exit(1)
+
+from dimos.robot.agilex.piper_arm import PiperArmRobot
+from dimos.skills.manipulation.pick_and_place import PickAndPlace
+from dimos.utils.logging_config import setup_logger
+
+logger = setup_logger("dimos.robot.agilex.run_robot")
+
+
+async def run_piper_arm():
+ """Run the Piper Arm robot with pick and place skill."""
+ logger.info("Starting Piper Arm Robot")
+
+ # Create robot instance
+ robot = PiperArmRobot()
+
+ try:
+ # Start the robot
+ await robot.start()
+
+ # Give modules time to fully initialize
+ await asyncio.sleep(3)
+
+ # Add the PickAndPlace skill to the robot's skill library
+ robot.skill_library.add(PickAndPlace)
+
+ logger.info("Robot initialized successfully")
+ print("\n=== Piper Arm Robot - Pick and Place Demo ===")
+ print("This demo uses hardcoded pick and place points.")
+ print("\nCommands:")
+ print(" 1. Run pick and place with hardcoded points")
+ print(" 2. Run pick-only with hardcoded point")
+ print(" r. Reset robot to idle")
+ print(" q. Quit")
+ print("")
+
+ running = True
+ while running:
+ try:
+ # Get user input
+ command = input("\nEnter command: ").strip().lower()
+
+ if command == "q":
+ logger.info("Quit requested")
+ running = False
+ break
+
+ elif command == "r" or command == "s":
+ logger.info("Resetting robot")
+ robot.handle_keyboard_command(command)
+
+ elif command == "1":
+ # Hardcoded pick and place points
+ # These should be adjusted based on your camera view
+ print("\nExecuting pick and place with hardcoded points...")
+
+ # Create and execute the skill
+ skill = PickAndPlace(
+ robot=robot,
+ object_query="labubu doll", # Will use visual detection
+ target_query="on the keyboard", # Will use visual detection
+ )
+
+ result = skill()
+
+ if result["success"]:
+ print(f"✓ {result['message']}")
+ else:
+ print(f"✗ Failed: {result.get('error', 'Unknown error')}")
+
+ elif command == "2":
+ # Pick-only with hardcoded point
+ print("\nExecuting pick-only with hardcoded point...")
+
+ # Create and execute the skill for pick-only
+ skill = PickAndPlace(
+ robot=robot,
+ object_query="labubu doll", # Will use visual detection
+ target_query=None, # No place target - pick only
+ )
+
+ result = skill()
+
+ if result["success"]:
+ print(f"✓ {result['message']}")
+ else:
+ print(f"✗ Failed: {result.get('error', 'Unknown error')}")
+
+ else:
+ print("Invalid command. Please try again.")
+
+ # Small delay to prevent CPU spinning
+ await asyncio.sleep(0.1)
+
+ except KeyboardInterrupt:
+ logger.info("Keyboard interrupt received")
+ running = False
+ break
+ except Exception as e:
+ logger.error(f"Error in command loop: {e}")
+ print(f"Error: {e}")
+
+ except Exception as e:
+ logger.error(f"Error running robot: {e}")
+ import traceback
+
+ traceback.print_exc()
+
+ finally:
+ # Clean up
+ logger.info("Shutting down robot...")
+ await robot.stop()
+ logger.info("Robot stopped")
+
+
+def main():
+ """Main entry point."""
+ print("Starting Piper Arm Robot...")
+ print("Note: The robot will use Qwen VLM to identify objects and locations")
+ print("based on the queries specified in the code.")
+
+ # Run the robot
+ asyncio.run(run_piper_arm())
+
+
+if __name__ == "__main__":
+ main()
diff --git a/tests/test_pointcloud_filtering.py b/tests/test_pointcloud_filtering.py
index 308b4fc6ac..57a1cb5b00 100644
--- a/tests/test_pointcloud_filtering.py
+++ b/tests/test_pointcloud_filtering.py
@@ -23,7 +23,7 @@
from dimos.stream.stereo_camera_streams.zed import ZEDCameraStream
from dimos.web.robot_web_interface import RobotWebInterface
from dimos.utils.logging_config import logger
-from dimos.perception.manip_aio_pipeline import ManipulationPipeline
+from dimos.manipulation.manip_aio_pipeline import ManipulationPipeline
def main():
diff --git a/tests/test_zed_module.py b/tests/test_zed_module.py
new file mode 100644
index 0000000000..fbc99a54a4
--- /dev/null
+++ b/tests/test_zed_module.py
@@ -0,0 +1,276 @@
+#!/usr/bin/env python3
+# Copyright 2025 Dimensional Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Test script for ZED Module with LCM visualization."""
+
+import asyncio
+import threading
+import time
+from typing import Optional
+import numpy as np
+import cv2
+
+from dimos import core
+from dimos.hardware.zed_camera import ZEDModule
+from dimos.protocol import pubsub
+from dimos.utils.logging_config import setup_logger
+from dimos.perception.common.utils import colorize_depth
+
+# Import LCM message types
+from dimos_lcm.sensor_msgs import Image as LCMImage
+from dimos_lcm.sensor_msgs import CameraInfo
+from dimos_lcm.geometry_msgs import PoseStamped
+from dimos.protocol.pubsub.lcmpubsub import LCM, Topic
+
+logger = setup_logger("test_zed_module")
+
+
+class ZEDVisualizationNode:
+ """Node that subscribes to ZED topics and visualizes the data."""
+
+ def __init__(self):
+ self.lcm = LCM()
+ self.latest_color = None
+ self.latest_depth = None
+ self.latest_pose = None
+ self.camera_info = None
+ self._running = False
+
+ # Subscribe to topics
+ self.color_topic = Topic("/zed/color_image", LCMImage)
+ self.depth_topic = Topic("/zed/depth_image", LCMImage)
+ self.camera_info_topic = Topic("/zed/camera_info", CameraInfo)
+ self.pose_topic = Topic("/zed/pose", PoseStamped)
+
+ def start(self):
+ """Start the visualization node."""
+ self._running = True
+ self.lcm.start()
+
+ # Subscribe to topics
+ self.lcm.subscribe(self.color_topic, self._on_color_image)
+ self.lcm.subscribe(self.depth_topic, self._on_depth_image)
+ self.lcm.subscribe(self.camera_info_topic, self._on_camera_info)
+ self.lcm.subscribe(self.pose_topic, self._on_pose)
+
+ logger.info("Visualization node started, subscribed to ZED topics")
+
+ def stop(self):
+ """Stop the visualization node."""
+ self._running = False
+ cv2.destroyAllWindows()
+
+ def _on_color_image(self, msg: LCMImage, topic: str):
+ """Handle color image messages."""
+ try:
+ # Convert LCM message to numpy array
+ data = np.frombuffer(msg.data, dtype=np.uint8)
+
+ if msg.encoding == "rgb8":
+ image = data.reshape((msg.height, msg.width, 3))
+ # Convert RGB to BGR for OpenCV
+ image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)
+ elif msg.encoding == "mono8":
+ image = data.reshape((msg.height, msg.width))
+ else:
+ logger.warning(f"Unsupported encoding: {msg.encoding}")
+ return
+
+ self.latest_color = image
+ logger.debug(f"Received color image: {msg.width}x{msg.height}")
+
+ except Exception as e:
+ logger.error(f"Error processing color image: {e}")
+
+ def _on_depth_image(self, msg: LCMImage, topic: str):
+ """Handle depth image messages."""
+ try:
+ # Convert LCM message to numpy array
+ if msg.encoding == "32FC1":
+ data = np.frombuffer(msg.data, dtype=np.float32)
+ depth = data.reshape((msg.height, msg.width))
+ else:
+ logger.warning(f"Unsupported depth encoding: {msg.encoding}")
+ return
+
+ self.latest_depth = depth
+ logger.debug(f"Received depth image: {msg.width}x{msg.height}")
+
+ except Exception as e:
+ logger.error(f"Error processing depth image: {e}")
+
+ def _on_camera_info(self, msg: CameraInfo, topic: str):
+ """Handle camera info messages."""
+ self.camera_info = msg
+ logger.info(
+ f"Received camera info: {msg.width}x{msg.height}, distortion model: {msg.distortion_model}"
+ )
+
+ def _on_pose(self, msg: PoseStamped, topic: str):
+ """Handle pose messages."""
+ self.latest_pose = msg
+ pos = msg.pose.position
+ ori = msg.pose.orientation
+ logger.debug(
+ f"Pose: pos=({pos.x:.2f}, {pos.y:.2f}, {pos.z:.2f}), "
+ + f"ori=({ori.x:.2f}, {ori.y:.2f}, {ori.z:.2f}, {ori.w:.2f})"
+ )
+
+ def visualize(self):
+ """Run visualization loop."""
+ while self._running:
+ # Create visualization
+ vis_images = []
+
+ # Color image
+ if self.latest_color is not None:
+ color_vis = self.latest_color.copy()
+
+ # Add pose text if available
+ if self.latest_pose is not None:
+ pos = self.latest_pose.pose.position
+ text = f"Pose: ({pos.x:.2f}, {pos.y:.2f}, {pos.z:.2f})"
+ cv2.putText(
+ color_vis, text, (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 0.6, (0, 255, 0), 2
+ )
+
+ vis_images.append(("ZED Color", color_vis))
+
+ # Depth image
+ if self.latest_depth is not None:
+ depth_colorized = colorize_depth(self.latest_depth, max_depth=5.0)
+ if depth_colorized is not None:
+ # Convert RGB to BGR for OpenCV
+ depth_colorized = cv2.cvtColor(depth_colorized, cv2.COLOR_RGB2BGR)
+
+ # Add depth stats
+ valid_mask = np.isfinite(self.latest_depth) & (self.latest_depth > 0)
+ if np.any(valid_mask):
+ min_depth = np.min(self.latest_depth[valid_mask])
+ max_depth = np.max(self.latest_depth[valid_mask])
+ mean_depth = np.mean(self.latest_depth[valid_mask])
+
+ text = f"Depth: min={min_depth:.2f}m, max={max_depth:.2f}m, mean={mean_depth:.2f}m"
+ cv2.putText(
+ depth_colorized,
+ text,
+ (10, 30),
+ cv2.FONT_HERSHEY_SIMPLEX,
+ 0.5,
+ (255, 255, 255),
+ 1,
+ )
+
+ vis_images.append(("ZED Depth", depth_colorized))
+
+ # Show windows
+ for name, image in vis_images:
+ cv2.imshow(name, image)
+
+ # Handle key press
+ key = cv2.waitKey(1) & 0xFF
+ if key == ord("q"):
+ logger.info("Quit requested")
+ self._running = False
+ break
+ elif key == ord("s"):
+ # Save images
+ if self.latest_color is not None:
+ cv2.imwrite("zed_color.png", self.latest_color)
+ logger.info("Saved color image to zed_color.png")
+ if self.latest_depth is not None:
+ np.save("zed_depth.npy", self.latest_depth)
+ logger.info("Saved depth data to zed_depth.npy")
+
+ time.sleep(0.03) # ~30 FPS
+
+
+async def test_zed_module():
+    """Test the ZED Module with visualization.
+
+    Deploys a ZEDModule on a local Dask worker, wires its output streams to
+    LCM topics, and runs a ZEDVisualizationNode that renders whatever it
+    receives.  Runs until the user presses 'q' in an OpenCV window; always
+    shuts the Dask cluster down on exit.
+    """
+    logger.info("Starting ZED Module test")
+
+    # Start Dask (one worker is enough for a single module)
+    dimos = core.start(1)
+
+    # Enable LCM auto-configuration
+    pubsub.lcm.autoconf()
+
+    try:
+        # Deploy ZED module
+        logger.info("Deploying ZED module...")
+        zed = dimos.deploy(
+            ZEDModule,
+            camera_id=0,
+            resolution="HD720",
+            depth_mode="NEURAL",
+            fps=30,
+            enable_tracking=True,
+            publish_rate=10.0,  # 10 Hz for testing
+            frame_id="zed_camera",
+        )
+
+        # Configure LCM transports.  Topic names must match the ones
+        # ZEDVisualizationNode subscribes to.
+        zed.color_image.transport = core.LCMTransport("/zed/color_image", LCMImage)
+        zed.depth_image.transport = core.LCMTransport("/zed/depth_image", LCMImage)
+        zed.camera_info.transport = core.LCMTransport("/zed/camera_info", CameraInfo)
+        zed.pose.transport = core.LCMTransport("/zed/pose", PoseStamped)
+
+        # Print module info (io() presumably returns a future — .result() blocks)
+        logger.info("ZED Module configured:")
+        print(zed.io().result())
+
+        # Start ZED module
+        logger.info("Starting ZED module...")
+        zed.start()
+
+        # Give module time to initialize
+        # NOTE(review): fixed 2 s delay assumes camera startup fits within it
+        # — confirm, or poll for readiness instead.
+        await asyncio.sleep(2)
+
+        # Create and start visualization node
+        viz_node = ZEDVisualizationNode()
+        viz_node.start()
+
+        # Run visualization in separate thread; daemon=True so it cannot keep
+        # the process alive if shutdown is abrupt
+        viz_thread = threading.Thread(target=viz_node.visualize, daemon=True)
+        viz_thread.start()
+
+        logger.info("ZED Module running. Press 'q' in image window to quit, 's' to save images.")
+
+        # Keep running until visualization stops ('q' pressed in the window)
+        while viz_node._running:
+            await asyncio.sleep(0.1)
+
+        # Stop ZED module
+        logger.info("Stopping ZED module...")
+        zed.stop()
+
+        # Stop visualization
+        viz_node.stop()
+
+    except Exception as e:
+        logger.error(f"Error in test: {e}")
+        import traceback
+
+        traceback.print_exc()
+
+    finally:
+        # Clean up: tear down the Dask cluster even on failure
+        dimos.close()
+        logger.info("Test completed")
+
+
+if __name__ == "__main__":
+    # Run the end-to-end test under a fresh asyncio event loop.
+    asyncio.run(test_zed_module())