diff --git a/data/.lfs/person.tar.gz b/data/.lfs/person.tar.gz new file mode 100644 index 0000000000..1f32d0db58 --- /dev/null +++ b/data/.lfs/person.tar.gz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:332c3196c6436e7d4c2b7e3314b4a4055865ef358b2e9cf3c8ddd7e173f39b93 +size 2535758 diff --git a/dimos/agents/skills/person_follow.py b/dimos/agents/skills/person_follow.py new file mode 100644 index 0000000000..0d4420632c --- /dev/null +++ b/dimos/agents/skills/person_follow.py @@ -0,0 +1,248 @@ +# Copyright 2025-2026 Dimensional Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
from threading import Event, RLock
import time
from typing import TYPE_CHECKING

import numpy as np
from reactivex.disposable import Disposable

from dimos.core.core import rpc
from dimos.core.global_config import GlobalConfig
from dimos.core.skill_module import SkillModule
from dimos.core.stream import In, Out
from dimos.models.qwen.video_query import BBox
from dimos.models.segmentation.edge_tam import EdgeTAMProcessor
from dimos.models.vl.qwen import QwenVlModel
from dimos.msgs.geometry_msgs import Twist
from dimos.msgs.sensor_msgs import CameraInfo, Image, PointCloud2
from dimos.navigation.visual.query import get_object_bbox_from_image
from dimos.navigation.visual_servoing.detection_navigation import DetectionNavigation
from dimos.navigation.visual_servoing.visual_servoing_2d import VisualServoing2D
from dimos.protocol.skill.skill import skill
from dimos.utils.logging_config import setup_logger

if TYPE_CHECKING:
    from dimos.models.vl.base import VlModel

logger = setup_logger()


class PersonFollowSkillContainer(SkillModule):
    """Skill container for following a person.

    This skill uses:
    - A VL model (QwenVlModel) to initially detect a person from a text description.
    - EdgeTAM for continuous tracking across frames.
    - Visual servoing OR 3D navigation to control robot movement towards the person.
    - Does not do obstacle avoidance; assumes a clear path.
    """

    color_image: In[Image]
    global_map: In[PointCloud2]
    cmd_vel: Out[Twist]

    # Control loop frequency in Hz.
    _frequency: float = 20.0
    # Consecutive empty-detection frames tolerated before declaring the person lost.
    _max_lost_frames: int = 15

    def __init__(
        self,
        camera_info: CameraInfo,
        global_config: GlobalConfig,
        use_3d_navigation: bool = False,
    ) -> None:
        super().__init__()
        self._global_config: GlobalConfig = global_config
        self._use_3d_navigation: bool = use_3d_navigation
        self._latest_image: Image | None = None
        self._latest_pointcloud: PointCloud2 | None = None
        self._vl_model: VlModel = QwenVlModel()
        self._tracker: EdgeTAMProcessor | None = None
        self._should_stop: Event = Event()
        self._lock = RLock()

        # In simulation the supplied intrinsics do not apply; use MuJoCo's camera.
        if self._global_config.simulation:
            from dimos.robot.unitree_webrtc.mujoco_connection import MujocoConnection

            camera_info = MujocoConnection.camera_info_static

        self._camera_info = camera_info
        self._visual_servo = VisualServoing2D(camera_info, self._global_config.simulation)
        self._detection_navigation = DetectionNavigation(self.tf, camera_info)

    @rpc
    def start(self) -> None:
        super().start()
        self._disposables.add(Disposable(self.color_image.subscribe(self._on_color_image)))
        # The global map is only consumed when navigating in 3D.
        if self._use_3d_navigation:
            self._disposables.add(Disposable(self.global_map.subscribe(self._on_pointcloud)))

    @rpc
    def stop(self) -> None:
        self._stop_following()

        with self._lock:
            if self._tracker is not None:
                self._tracker.stop()
                self._tracker = None

        self._vl_model.stop()
        super().stop()

    @skill()
    def follow_person(self, query: str) -> str:
        """Follow a person matching the given description using visual servoing.

        The robot will continuously track and follow the person, while keeping
        them centered in the camera view.

        Args:
            query: Description of the person to follow (e.g., "man with blue shirt")

        Returns:
            Status message indicating the result of the following action.

        Example:
            follow_person("man with blue shirt")
            follow_person("person in the doorway")
        """

        self._stop_following()

        self._should_stop.clear()

        with self._lock:
            frame = self._latest_image

        if frame is None:
            return "No image available to detect person."

        # One-shot open-vocabulary detection to seed the tracker.
        initial_bbox = get_object_bbox_from_image(
            self._vl_model,
            frame,
            query,
        )

        if initial_bbox is None:
            return f"Could not find '{query}' in the current view."

        return self._follow_loop(query, initial_bbox)

    @skill()
    def stop_following(self) -> str:
        """Stop following the current person.

        Returns:
            Confirmation message.
        """
        self._stop_following()

        self.cmd_vel.publish(Twist.zero())

        return "Stopped following."

    def _on_color_image(self, image: Image) -> None:
        with self._lock:
            self._latest_image = image

    def _on_pointcloud(self, pointcloud: PointCloud2) -> None:
        with self._lock:
            self._latest_pointcloud = pointcloud

    def _follow_loop(self, query: str, initial_bbox: BBox) -> str:
        """Track the person seeded by ``initial_bbox`` and publish velocity commands.

        Runs at ``_frequency`` Hz until the person is lost, an error occurs, or a
        stop is requested; always finishes by publishing a zero twist.
        """
        x1, y1, x2, y2 = initial_bbox
        prompt_box = np.array([x1, y1, x2, y2], dtype=np.float32)

        with self._lock:
            if self._tracker is None:
                self._tracker = EdgeTAMProcessor()
            tracker = self._tracker
            frame = self._latest_image

        if frame is None:
            return "No image available to start tracking."

        initial_detections = tracker.init_track(
            image=frame,
            box=prompt_box,
            obj_id=1,
        )

        if len(initial_detections) == 0:
            self.cmd_vel.publish(Twist.zero())
            return f"EdgeTAM failed to segment '{query}'."

        logger.info(f"EdgeTAM initialized with {len(initial_detections)} detections")

        frames_lost = 0
        period = 1.0 / self._frequency
        next_tick = time.monotonic()

        while not self._should_stop.is_set():
            next_tick += period

            with self._lock:
                frame = self._latest_image
            assert frame is not None

            detections = tracker.process_image(frame)

            if len(detections) == 0:
                self.cmd_vel.publish(Twist.zero())

                frames_lost += 1
                if frames_lost > self._max_lost_frames:
                    self.cmd_vel.publish(Twist.zero())
                    return f"Lost track of '{query}'. Stopping."
            else:
                frames_lost = 0
                # Follow the detection with the largest bounding box.
                best_detection = max(detections.detections, key=lambda d: d.bbox_2d_volume())

                if self._use_3d_navigation:
                    with self._lock:
                        pointcloud = self._latest_pointcloud
                    if pointcloud is None:
                        self.cmd_vel.publish(Twist.zero())
                        return "No pointcloud available for 3D navigation. Stopping."
                    twist = self._detection_navigation.compute_twist_for_detection_3d(
                        pointcloud,
                        best_detection,
                        frame,
                    )
                    if twist is None:
                        self.cmd_vel.publish(Twist.zero())
                        return f"3D navigation failed for '{query}'. Stopping."
                else:
                    twist = self._visual_servo.compute_twist(
                        best_detection.bbox,
                        frame.width,
                    )
                self.cmd_vel.publish(twist)

            # Fixed-rate pacing: sleep off whatever remains of this period.
            remaining = next_tick - time.monotonic()
            if remaining > 0:
                time.sleep(remaining)

        self.cmd_vel.publish(Twist.zero())
        return "Stopped following as requested."

    def _stop_following(self) -> None:
        self._should_stop.set()


person_follow_skill = PersonFollowSkillContainer.blueprint

__all__ = ["PersonFollowSkillContainer", "person_follow_skill"]
lcm_spy.save_topic("/rpc/DemoCalculatorSkill/sum_numbers/req") lcm_spy.save_topic("/rpc/DemoCalculatorSkill/sum_numbers/res") - start_blueprint("demo-skill") + start_blueprint("run", "demo-skill") lcm_spy.wait_for_saved_topic("/rpc/DemoCalculatorSkill/set_AgentSpec_register_skills/res") lcm_spy.wait_for_saved_topic("/rpc/HumanInput/start/res") diff --git a/dimos/e2e_tests/test_person_follow.py b/dimos/e2e_tests/test_person_follow.py new file mode 100644 index 0000000000..663dee420e --- /dev/null +++ b/dimos/e2e_tests/test_person_follow.py @@ -0,0 +1,85 @@ +# Copyright 2025-2026 Dimensional Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
from collections.abc import Callable, Generator
import os
import threading
import time

import pytest

from dimos.e2e_tests.dimos_cli_call import DimosCliCall
from dimos.e2e_tests.lcm_spy import LcmSpy
from dimos.simulation.mujoco.person_on_track import PersonTrackPublisher

# Callable that starts a simulated person walking along the given XY waypoints.
StartPersonTrack = Callable[[list[tuple[float, float]]], None]


@pytest.fixture
def start_person_track() -> Generator[StartPersonTrack, None, None]:
    """Yield a starter for a background thread that animates a person in MuJoCo.

    The returned callable creates a ``PersonTrackPublisher`` for the given track
    and ticks it at ~60 Hz on a daemon thread. Teardown stops the thread and
    the publisher.
    """
    publisher: PersonTrackPublisher | None = None
    stop_event = threading.Event()
    thread: threading.Thread | None = None

    def start(track: list[tuple[float, float]]) -> None:
        nonlocal publisher, thread
        publisher = PersonTrackPublisher(track)

        def run_person_track() -> None:
            while not stop_event.is_set():
                publisher.tick()
                time.sleep(1 / 60)

        thread = threading.Thread(target=run_person_track, daemon=True)
        thread.start()

    yield start

    stop_event.set()
    if thread is not None:
        thread.join(timeout=1.0)
    if publisher is not None:
        publisher.stop()


@pytest.mark.skipif(bool(os.getenv("CI")), reason="LCM spy doesn't work in CI.")
@pytest.mark.skipif(not os.getenv("OPENAI_API_KEY"), reason="OPENAI_API_KEY not set.")
@pytest.mark.e2e
def test_person_follow(
    lcm_spy: LcmSpy,
    # start_blueprint takes variadic CLI args (see conftest), not a single name.
    start_blueprint: Callable[..., DimosCliCall],
    human_input: Callable[[str], None],
    start_person_track: StartPersonTrack,
) -> None:
    """E2E: the agent follows a scripted person and ends near the track."""
    start_blueprint("--mujoco-start-pos", "-6.18 0.96", "run", "unitree-go2-agentic")

    # Wait for the agent stack to come up before interacting with it.
    lcm_spy.save_topic("/rpc/HumanInput/start/res")
    lcm_spy.wait_for_saved_topic("/rpc/HumanInput/start/res", timeout=120.0)
    lcm_spy.save_topic("/agent")
    lcm_spy.wait_for_saved_topic_content("/agent", b"AIMessage", timeout=120.0)

    time.sleep(5)

    start_person_track(
        [
            (-2.60, 1.28),
            (4.80, 0.21),
            (4.14, -6.0),
            (0.59, -3.79),
            (-3.35, -0.51),
        ]
    )
    human_input("follow the person in beige pants")

    lcm_spy.wait_until_odom_position(4.2, -3, threshold=1.5)
a/dimos/e2e_tests/test_spatial_memory.py b/dimos/e2e_tests/test_spatial_memory.py index 5029f46525..171f638f19 100644 --- a/dimos/e2e_tests/test_spatial_memory.py +++ b/dimos/e2e_tests/test_spatial_memory.py @@ -32,7 +32,7 @@ def test_spatial_memory_navigation( human_input: Callable[[str], None], follow_points: Callable[..., None], ) -> None: - start_blueprint("unitree-go2-agentic") + start_blueprint("run", "unitree-go2-agentic") lcm_spy.save_topic("/rpc/HumanInput/start/res") lcm_spy.wait_for_saved_topic("/rpc/HumanInput/start/res", timeout=120.0) diff --git a/dimos/models/segmentation/configs/edgetam.yaml b/dimos/models/segmentation/configs/edgetam.yaml new file mode 100644 index 0000000000..6fe21c99df --- /dev/null +++ b/dimos/models/segmentation/configs/edgetam.yaml @@ -0,0 +1,138 @@ +# @package _global_ + +# Model +model: + _target_: sam2.sam2_video_predictor.SAM2VideoPredictor + image_encoder: + _target_: sam2.modeling.backbones.image_encoder.ImageEncoder + scalp: 1 + trunk: + _target_: sam2.modeling.backbones.timm.TimmBackbone + name: repvit_m1.dist_in1k + features: + - layer0 + - layer1 + - layer2 + - layer3 + neck: + _target_: sam2.modeling.backbones.image_encoder.FpnNeck + position_encoding: + _target_: sam2.modeling.position_encoding.PositionEmbeddingSine + num_pos_feats: 256 + normalize: true + scale: null + temperature: 10000 + d_model: 256 + backbone_channel_list: [384, 192, 96, 48] + fpn_top_down_levels: [2, 3] # output level 0 and 1 directly use the backbone features + fpn_interp_model: nearest + + memory_attention: + _target_: sam2.modeling.memory_attention.MemoryAttention + d_model: 256 + pos_enc_at_input: true + layer: + _target_: sam2.modeling.memory_attention.MemoryAttentionLayer + activation: relu + dim_feedforward: 2048 + dropout: 0.1 + pos_enc_at_attn: false + self_attention: + _target_: sam2.modeling.sam.transformer.RoPEAttention + rope_theta: 10000.0 + feat_sizes: [32, 32] + embedding_dim: 256 + num_heads: 1 + downsample_rate: 1 + dropout: 
0.1 + d_model: 256 + pos_enc_at_cross_attn_keys: true + pos_enc_at_cross_attn_queries: false + cross_attention: + _target_: sam2.modeling.sam.transformer.RoPEAttentionv2 + rope_theta: 10000.0 + q_sizes: [64, 64] + k_sizes: [16, 16] + embedding_dim: 256 + num_heads: 1 + downsample_rate: 1 + dropout: 0.1 + kv_in_dim: 64 + num_layers: 2 + + memory_encoder: + _target_: sam2.modeling.memory_encoder.MemoryEncoder + out_dim: 64 + position_encoding: + _target_: sam2.modeling.position_encoding.PositionEmbeddingSine + num_pos_feats: 64 + normalize: true + scale: null + temperature: 10000 + mask_downsampler: + _target_: sam2.modeling.memory_encoder.MaskDownSampler + kernel_size: 3 + stride: 2 + padding: 1 + fuser: + _target_: sam2.modeling.memory_encoder.Fuser + layer: + _target_: sam2.modeling.memory_encoder.CXBlock + dim: 256 + kernel_size: 7 + padding: 3 + layer_scale_init_value: 1e-6 + use_dwconv: True # depth-wise convs + num_layers: 2 + + spatial_perceiver: + _target_: sam2.modeling.perceiver.PerceiverResampler + depth: 2 + dim: 64 + dim_head: 64 + heads: 1 + ff_mult: 4 + hidden_dropout_p: 0. + attention_dropout_p: 0. 
+ pos_enc_at_key_value: true # implicit pos + concat_kv_latents: false + num_latents: 256 + num_latents_2d: 256 + position_encoding: + _target_: sam2.modeling.position_encoding.PositionEmbeddingSine + num_pos_feats: 64 + normalize: true + scale: null + temperature: 10000 + use_self_attn: true + + num_maskmem: 7 + image_size: 1024 + # apply scaled sigmoid on mask logits for memory encoder, and directly feed input mask as output mask + sigmoid_scale_for_mem_enc: 20.0 + sigmoid_bias_for_mem_enc: -10.0 + use_mask_input_as_output_without_sam: true + # Memory + directly_add_no_mem_embed: true + # use high-resolution feature map in the SAM mask decoder + use_high_res_features_in_sam: true + # output 3 masks on the first click on initial conditioning frames + multimask_output_in_sam: true + # SAM heads + iou_prediction_use_sigmoid: True + # cross-attend to object pointers from other frames (based on SAM output tokens) in the encoder + use_obj_ptrs_in_encoder: true + add_tpos_enc_to_obj_ptrs: false + only_obj_ptrs_in_the_past_for_eval: true + # object occlusion prediction + pred_obj_scores: true + pred_obj_scores_mlp: true + fixed_no_obj_ptr: true + # multimask tracking settings + multimask_output_for_tracking: true + use_multimask_token_for_obj_ptr: true + multimask_min_pt_num: 0 + multimask_max_pt_num: 1 + use_mlp_for_obj_ptr_proj: true + # Compilation flag + compile_image_encoder: false diff --git a/dimos/models/segmentation/edge_tam.py b/dimos/models/segmentation/edge_tam.py new file mode 100644 index 0000000000..ba351be130 --- /dev/null +++ b/dimos/models/segmentation/edge_tam.py @@ -0,0 +1,269 @@ +# Copyright 2025-2026 Dimensional Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
from collections.abc import Generator
from contextlib import contextmanager
import os
from pathlib import Path
import shutil
import tempfile
from typing import TYPE_CHECKING, Any, TypedDict

import cv2
from hydra.utils import instantiate  # type: ignore[import-not-found]
import numpy as np
from numpy.typing import NDArray
from omegaconf import OmegaConf  # type: ignore[import-not-found]
from PIL import Image as PILImage
import torch

from dimos.msgs.sensor_msgs import Image
from dimos.perception.detection.detectors.types import Detector
from dimos.perception.detection.type import ImageDetections2D
from dimos.perception.detection.type.detection2d.seg import Detection2DSeg
from dimos.utils.data import get_data
from dimos.utils.logging_config import setup_logger

if TYPE_CHECKING:
    from sam2.sam2_video_predictor import SAM2VideoPredictor  # type: ignore[import-untyped]

# Silence tqdm progress bars from the SAM2 predictor.
os.environ['TQDM_DISABLE'] = '1'

logger = setup_logger()


class SAM2InferenceState(TypedDict):
    # Subset of SAM2's video-predictor inference state that this wrapper mutates.
    images: list[torch.Tensor | None]
    num_frames: int
    cached_features: dict[int, Any]


class EdgeTAMProcessor(Detector):
    """Streaming object tracker built on EdgeTAM (a SAM2 video predictor).

    Tracking is seeded once with a point/box prompt via ``init_track`` and then
    propagated one frame at a time via ``process_image``. Requires CUDA.
    """

    _predictor: "SAM2VideoPredictor"
    _inference_state: SAM2InferenceState | None
    _frame_count: int
    _is_tracking: bool
    _buffer_size: int

    def __init__(self) -> None:
        config_path = Path(__file__).parent / "configs" / "edgetam.yaml"

        if not config_path.exists():
            raise FileNotFoundError(f"EdgeTAM config not found at {config_path}")

        if not torch.cuda.is_available():
            raise RuntimeError("EdgeTAM requires a CUDA-capable GPU")

        cfg = OmegaConf.load(config_path)

        # Runtime overrides layered on top of the static YAML config.
        overrides = {
            "model.sam_mask_decoder_extra_args.dynamic_multimask_via_stability": True,
            "model.sam_mask_decoder_extra_args.dynamic_multimask_stability_delta": 0.05,
            "model.sam_mask_decoder_extra_args.dynamic_multimask_stability_thresh": 0.98,
            "model.binarize_mask_from_pts_for_mem_enc": True,
            "model.fill_hole_area": 8,
        }
        for key, value in overrides.items():
            OmegaConf.update(cfg, key, value)

        if cfg.model._target_ != "sam2.sam2_video_predictor.SAM2VideoPredictor":
            logger.warning(
                f"Config target is {cfg.model._target_}, forcing SAM2VideoPredictor"
            )
            cfg.model._target_ = "sam2.sam2_video_predictor.SAM2VideoPredictor"

        self._predictor = instantiate(cfg.model, _recursive_=True)

        ckpt_path = str(get_data("models_edgetam") / "edgetam.pt")

        state_dict = torch.load(ckpt_path, map_location="cpu", weights_only=True)["model"]
        # NOTE(review): load_state_dict defaults to strict=True, which already raises
        # on key mismatch, so these manual raises may be unreachable — confirm intent.
        missing_keys, unexpected_keys = self._predictor.load_state_dict(state_dict)
        if missing_keys:
            raise RuntimeError("Missing keys in checkpoint")
        if unexpected_keys:
            raise RuntimeError("Unexpected keys in checkpoint")

        self._predictor = self._predictor.to("cuda")
        self._predictor.eval()

        self._inference_state = None
        self._frame_count = 0
        self._is_tracking = False
        self._buffer_size = 100  # Keep last N frames in memory to avoid OOM

    def _prepare_frame(self, image: Image) -> torch.Tensor:
        """Prepare frame for SAM2 (resize, normalize, convert to tensor)."""
        bgr = image.to_opencv()
        rgb = cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB)
        pil = PILImage.fromarray(rgb)

        side = self._predictor.image_size
        arr = np.array(pil.resize((side, side))).astype(np.float32) / 255.0

        # ImageNet normalization expected by the SAM2 image encoder.
        mean = np.array([0.485, 0.456, 0.406], dtype=np.float32).reshape(1, 1, 3)
        std = np.array([0.229, 0.224, 0.225], dtype=np.float32).reshape(1, 1, 3)
        arr -= mean
        arr /= std

        return torch.from_numpy(arr).permute(2, 0, 1).float().cuda()

    def init_track(
        self,
        image: Image,
        points: NDArray[np.floating[Any]] | None = None,
        labels: NDArray[np.integer[Any]] | None = None,
        box: NDArray[np.floating[Any]] | None = None,
        obj_id: int = 1,
    ) -> ImageDetections2D:
        """Initialize tracking with a prompt (points or box).

        Args:
            image: Initial frame to start tracking from
            points: Point prompts for segmentation (Nx2 array of [x, y] coordinates)
            labels: Labels for points (1 = foreground, 0 = background)
            box: Bounding box prompt in [x1, y1, x2, y2] format
            obj_id: Object ID for tracking

        Returns:
            ImageDetections2D with initial segmentation mask
        """
        # Restart cleanly if a previous track is still active.
        if self._inference_state is not None:
            self.stop()

        self._frame_count = 0

        # SAM2's init_state expects a video directory; feed it the single seed frame.
        with _temp_dir_context(image) as video_path:
            self._inference_state = self._predictor.init_state(video_path=video_path)

        self._predictor.reset_state(self._inference_state)

        # Convert the stacked-frames tensor into a mutable per-frame list so
        # process_image can append / drop frames incrementally.
        if torch.is_tensor(self._inference_state["images"]):
            self._inference_state["images"] = [self._inference_state["images"][0]]

        self._is_tracking = True

        if points is not None:
            points = points.astype(np.float32)
        if labels is not None:
            labels = labels.astype(np.int32)
        if box is not None:
            box = box.astype(np.float32)

        with torch.no_grad():
            _, out_obj_ids, out_mask_logits = self._predictor.add_new_points_or_box(
                inference_state=self._inference_state,
                frame_idx=0,
                obj_id=obj_id,
                points=points,
                labels=labels,
                box=box,
            )

        return self._process_results(image, out_obj_ids, out_mask_logits)

    def process_image(self, image: Image) -> ImageDetections2D:
        """Process a new video frame and propagate tracking.

        Args:
            image: New frame to process

        Returns:
            ImageDetections2D with tracked object segmentation masks
        """
        if not self._is_tracking or self._inference_state is None:
            return ImageDetections2D(image=image)

        self._frame_count += 1

        # Append new frame to inference state
        frame_tensor = self._prepare_frame(image)
        self._inference_state["images"].append(frame_tensor)
        self._inference_state["num_frames"] += 1

        # Memory management: evict stale cached features and old frame tensors
        # so long-running tracks do not exhaust GPU/host memory.
        cached_features = self._inference_state["cached_features"]
        if len(cached_features) > self._buffer_size:
            oldest_frame = min(cached_features.keys())
            if oldest_frame < self._frame_count - self._buffer_size:
                del cached_features[oldest_frame]

        if len(self._inference_state["images"]) > self._buffer_size + 10:
            idx_to_drop = self._frame_count - self._buffer_size - 5
            if 0 <= idx_to_drop < len(self._inference_state["images"]):
                if self._inference_state["images"][idx_to_drop] is not None:
                    self._inference_state["images"][idx_to_drop] = None

        empty: ImageDetections2D = ImageDetections2D(image=image)

        with torch.no_grad():
            for out_frame_idx, out_obj_ids, out_mask_logits in self._predictor.propagate_in_video(
                self._inference_state, start_frame_idx=self._frame_count, max_frame_num_to_track=1
            ):
                if out_frame_idx == self._frame_count:
                    return self._process_results(image, out_obj_ids, out_mask_logits)

        return empty

    def _process_results(
        self,
        image: Image,
        obj_ids: list[int],
        mask_logits: torch.Tensor | NDArray[np.floating[Any]],
    ) -> ImageDetections2D:
        """Convert SAM2 mask logits into validated 2D segmentation detections."""
        detections: ImageDetections2D = ImageDetections2D(image=image)

        if len(obj_ids) == 0:
            return detections

        if isinstance(mask_logits, torch.Tensor):
            mask_logits = mask_logits.cpu().numpy()

        for i, obj_id in enumerate(obj_ids):
            seg = Detection2DSeg.from_sam2_result(
                mask=mask_logits[i],
                obj_id=obj_id,
                image=image,
                name="object",
            )
            if seg.is_valid():
                detections.detections.append(seg)

        return detections

    def stop(self) -> None:
        """Stop tracking and drop all per-video inference state."""
        self._is_tracking = False
        self._inference_state = None


@contextmanager
def _temp_dir_context(image: Image) -> Generator[str, None, None]:
    """Yield a temp directory containing *image* as frame ``00000.jpg``; clean up after."""
    path = tempfile.mkdtemp()

    image.save(f"{path}/00000.jpg")

    try:
        yield path
    finally:
        shutil.rmtree(path)
from dimos_lcm.sensor_msgs import CameraInfo as DimosLcmCameraInfo
import numpy as np

from dimos.msgs.geometry_msgs import Transform, Twist, Vector3
from dimos.msgs.sensor_msgs import CameraInfo, Image, PointCloud2
from dimos.perception.detection.type.detection2d.bbox import Detection2DBBox
from dimos.perception.detection.type.detection3d import Detection3DPC
from dimos.protocol.tf import LCMTF
from dimos.utils.logging_config import setup_logger

logger = setup_logger()


class DetectionNavigation:
    """Compute velocity commands that steer the robot towards a 2D detection.

    The detection is lifted to 3D using the world pointcloud and camera
    intrinsics, and a proportional controller produces a Twist that approaches
    the target while maintaining a standoff distance.
    """

    _target_distance_3d: float = 1.5  # meters to maintain from person
    _min_distance_3d: float = 0.8  # meters before backing up
    _max_linear_speed_3d: float = 0.5  # m/s
    _max_angular_speed_3d: float = 0.8  # rad/s
    _linear_gain_3d: float = 0.8
    _angular_gain_3d: float = 1.5

    _tf: LCMTF
    _camera_info: CameraInfo

    def __init__(self, tf: LCMTF, camera_info: CameraInfo) -> None:
        self._tf = tf
        self._camera_info = camera_info

    def compute_twist_for_detection_3d(
        self, pointcloud: PointCloud2, detection: Detection2DBBox, image: Image
    ) -> Twist | None:
        """Project a 2D detection to 3D using pointcloud and compute navigation twist.

        Args:
            pointcloud: World-frame pointcloud used to lift the detection to 3D
            detection: 2D detection with bounding box
            image: Current image frame

        Returns:
            Twist command to navigate towards the detection's 3D position,
            or None if the required transforms or 3D projection are unavailable.
        """

        # Get transform from world frame to camera optical frame
        world_to_optical = self._tf.get(
            "camera_optical", pointcloud.frame_id, image.ts, time_tolerance=1.0
        )
        if world_to_optical is None:
            logger.warning("Could not get camera transform")
            return None

        lcm_camera_info = DimosLcmCameraInfo()
        lcm_camera_info.K = self._camera_info.K
        lcm_camera_info.width = self._camera_info.width
        lcm_camera_info.height = self._camera_info.height

        # Project to 3D using the pointcloud
        detection_3d = Detection3DPC.from_2d(
            det=detection,
            world_pointcloud=pointcloud,
            camera_info=lcm_camera_info,
            world_to_optical_transform=world_to_optical,
            filters=[],  # Skip filtering for faster processing in follow loop
        )

        if detection_3d is None:
            logger.warning("3D projection failed")
            return None

        # Get robot position to compute robust target
        robot_transform = self._tf.get("world", "base_link", time_tolerance=1.0)
        if robot_transform is None:
            logger.warning("Could not get robot transform")
            return None

        robot_pos = robot_transform.translation

        # Compute robust target position using front-most points
        target_position = self._compute_robust_target_position(detection_3d.pointcloud, robot_pos)
        if target_position is None:
            logger.warning("Could not compute robust target position")
            return None

        return self._compute_twist_from_3d(target_position, robot_transform)

    def _compute_robust_target_position(
        self, pointcloud: PointCloud2, robot_pos: Vector3
    ) -> Vector3 | None:
        """Compute a robust target position from the detection pointcloud.

        Instead of using the centroid of all points (which includes floor/background),
        this method:
        1. Filters out floor points (z < 0.3m in world frame)
        2. Computes distance from robot to each remaining point
        3. Uses the 25th percentile of closest points to get the front surface
        4. Returns the centroid of those front-most points

        Args:
            pointcloud: The detection's pointcloud in world frame
            robot_pos: Robot's current position in world frame

        Returns:
            Vector3 position representing the front of the detected object,
            or None if not enough valid points.
        """
        points, _ = pointcloud.as_numpy()
        if len(points) < 10:
            return None

        # Filter out floor points (keep points above 0.3m height)
        height_mask = points[:, 2] > 0.3
        points = points[height_mask]
        if len(points) < 10:
            # Fall back to all points if height filtering removes too many
            points, _ = pointcloud.as_numpy()

        # Compute 2D distance (XY plane) from robot to each point
        dx = points[:, 0] - robot_pos.x
        dy = points[:, 1] - robot_pos.y
        distances = np.sqrt(dx * dx + dy * dy)

        # Use 25th percentile of distances to find front-most points
        distance_threshold = np.percentile(distances, 25)

        # Get points that are within the front 25%
        front_mask = distances <= distance_threshold
        front_points = points[front_mask]

        if len(front_points) < 3:
            # Fall back to median distance point
            median_dist = np.median(distances)
            close_mask = np.abs(distances - median_dist) < 0.3
            front_points = points[close_mask]
            if len(front_points) < 3:
                return None

        # Compute centroid of front-most points
        centroid = front_points.mean(axis=0)
        return Vector3(centroid[0], centroid[1], centroid[2])

    def _compute_twist_from_3d(self, target_position: Vector3, robot_transform: Transform) -> Twist:
        """Compute twist command to navigate towards a 3D target position.

        Args:
            target_position: 3D position of the target in world frame.
            robot_transform: Robot's current transform in world frame.

        Returns:
            Twist command for the robot.
        """
        robot_pos = robot_transform.translation

        # Compute vector from robot to target in world frame
        dx = target_position.x - robot_pos.x
        dy = target_position.y - robot_pos.y
        distance = np.sqrt(dx * dx + dy * dy)
        # Use lazy logging instead of print in this high-frequency control path.
        logger.debug("Distance to target: %.2f m", distance)

        # Compute angle to target in world frame
        angle_to_target = np.arctan2(dy, dx)

        # Get robot's current heading from transform
        robot_yaw = robot_transform.rotation.to_euler().z

        # Angle error (how much to turn)
        angle_error = angle_to_target - robot_yaw
        # Normalize to [-pi, pi]
        while angle_error > np.pi:
            angle_error -= 2 * np.pi
        while angle_error < -np.pi:
            angle_error += 2 * np.pi

        # Compute angular velocity (turn towards target)
        angular_z = angle_error * self._angular_gain_3d
        angular_z = float(
            np.clip(angular_z, -self._max_angular_speed_3d, self._max_angular_speed_3d)
        )

        # Compute linear velocity based on distance
        distance_error = distance - self._target_distance_3d

        if distance < self._min_distance_3d:
            # Too close, back up
            linear_x = -self._max_linear_speed_3d * 0.6
        else:
            # Move forward based on distance error, reduce speed when turning
            turn_factor = 1.0 - min(abs(angle_error) / np.pi, 0.7)
            linear_x = distance_error * self._linear_gain_3d * turn_factor
            linear_x = float(
                np.clip(linear_x, -self._max_linear_speed_3d, self._max_linear_speed_3d)
            )

        return Twist(
            linear=Vector3(linear_x, 0.0, 0.0),
            angular=Vector3(0.0, 0.0, angular_z),
        )
class VisualServoing2D:
    """2D visual servoing controller for tracking objects using bounding boxes.

    Uses camera intrinsics to convert pixel coordinates to normalized camera
    coordinates and estimates distance based on a known (assumed) object width.
    """

    # Target distance to maintain from object (meters).
    _target_distance: float = 1.5

    # Minimum distance before backing up (meters).
    _min_distance: float = 0.8

    # Maximum forward/backward speed (m/s).
    _max_linear_speed: float = 0.5

    # Maximum turning speed (rad/s).
    _max_angular_speed: float = 0.8

    # Assumed real-world width of tracked object (meters).
    _assumed_object_width: float = 0.45

    # Proportional gain for angular velocity control.
    _angular_gain: float = 1.0

    # Proportional gain for linear velocity control.
    _linear_gain: float = 0.8

    # Speed factor when backing up (multiplied by max_linear_speed).
    _backup_speed_factor: float = 0.6

    # Multiplier for x_norm when calculating turn factor.
    _turn_factor_multiplier: float = 2.0

    # Maximum speed reduction due to turning (turn_factor ranges from 1-this to 1).
    _turn_factor_max_reduction: float = 0.7

    # When True, the platform cannot rotate in place and needs some forward
    # motion while turning.
    _rotation_requires_linear_movement: bool = False

    # Camera intrinsics for coordinate conversion.
    _camera_info: CameraInfo

    def __init__(
        self, camera_info: CameraInfo, rotation_requires_linear_movement: bool = False
    ) -> None:
        self._camera_info = camera_info
        self._rotation_requires_linear_movement = rotation_requires_linear_movement

    def compute_twist(
        self,
        bbox: tuple[float, float, float, float],
        image_width: int,
    ) -> Twist:
        """Compute twist command to servo towards the tracked object.

        Args:
            bbox: Bounding box (x1, y1, x2, y2) in pixels.
            image_width: Width of the image.

        Returns:
            Twist command for the robot (zero twist if the bbox is degenerate).
        """
        x1, _, x2, _ = bbox
        bbox_center_x = (x1 + x2) / 2.0

        # Normalized x via inverse K; positive = object right of optical center.
        x_norm = self._get_normalized_x(bbox_center_x)

        estimated_distance = self._estimate_distance(bbox)

        if estimated_distance is None:
            return Twist.zero()

        # Distance error (positive = too far, need to move forward).
        distance_error = estimated_distance - self._target_distance

        # Angular velocity (turn towards object). Negative because positive
        # angular.z is counter-clockwise (left turn).
        angular_z = -x_norm * self._angular_gain
        angular_z = float(np.clip(angular_z, -self._max_angular_speed, self._max_angular_speed))

        # Linear velocity - ALWAYS move forward/backward based on distance.
        # Reduce forward speed when turning sharply to maintain stability.
        turn_factor = 1.0 - min(
            abs(x_norm) * self._turn_factor_multiplier, self._turn_factor_max_reduction
        )

        if estimated_distance < self._min_distance:
            # Too close, back up (don't reduce speed for backing up).
            linear_x = -self._max_linear_speed * self._backup_speed_factor
        else:
            # Move forward based on distance error with proportional gain.
            linear_x = distance_error * self._linear_gain * turn_factor
            linear_x = float(np.clip(linear_x, -self._max_linear_speed, self._max_linear_speed))

        # Enforce minimum linear speed when turning. BUGFIX: the original
        # condition was `abs(angular_z) < 0.02`, which enforced forward motion
        # only when NOT turning — the opposite of the stated intent and of the
        # `_rotation_requires_linear_movement` flag's meaning.
        if self._rotation_requires_linear_movement and abs(angular_z) > 0.02:
            linear_x = max(linear_x, 0.1)

        return Twist(
            linear=Vector3(linear_x, 0.0, 0.0),
            angular=Vector3(0.0, 0.0, angular_z),
        )

    def _get_normalized_x(self, pixel_x: float) -> float:
        """Convert pixel x coordinate to normalized camera coordinate.

        Uses inverse K matrix: x_norm = (pixel_x - cx) / fx

        Args:
            pixel_x: x coordinate in pixels

        Returns:
            Normalized x coordinate (tan of angle from optical center)
        """
        fx = self._camera_info.K[0]  # focal length x
        cx = self._camera_info.K[2]  # optical center x
        return (pixel_x - cx) / fx

    def _estimate_distance(self, bbox: tuple[float, float, float, float]) -> float | None:
        """Estimate distance to object from bbox size and camera intrinsics.

        Uses the pinhole camera model:
            pixel_width / fx = real_width / distance
            distance = (real_width * fx) / pixel_width

        Uses bbox width instead of height because a ground robot can't see
        full person height when close; width (shoulders) is more consistently
        visible.

        Args:
            bbox: Bounding box (x1, y1, x2, y2) in pixels.

        Returns:
            Estimated distance in meters, or None if bbox is invalid.
        """
        bbox_width = bbox[2] - bbox[0]  # x2 - x1

        if bbox_width <= 0:
            return None

        fx = self._camera_info.K[0]  # focal length x in pixels
        return (self._assumed_object_width * fx) / bbox_width
+ + Uses pinhole camera model: f = height / (2 * tan(fovy / 2)) + """ + import math + + fovy = math.radians(VIDEO_CAMERA_FOV) + f = VIDEO_HEIGHT / (2 * math.tan(fovy / 2)) + cx = VIDEO_WIDTH / 2.0 + cy = VIDEO_HEIGHT / 2.0 + + return CameraInfo( + frame_id="camera_optical", + height=VIDEO_HEIGHT, + width=VIDEO_WIDTH, + distortion_model="plumb_bob", + D=[0.0, 0.0, 0.0, 0.0, 0.0], + K=[f, 0.0, cx, 0.0, f, cy, 0.0, 0.0, 1.0], + R=[1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0], + P=[f, 0.0, cx, 0.0, 0.0, f, cy, 0.0, 0.0, 0.0, 1.0, 0.0], + ) + + camera_info_static: CameraInfo = _compute_camera_info() + def start(self) -> None: self.shm_data = ShmWriter() diff --git a/dimos/robot/unitree_webrtc/unitree_go2_blueprints.py b/dimos/robot/unitree_webrtc/unitree_go2_blueprints.py index be53702412..8a017ad004 100644 --- a/dimos/robot/unitree_webrtc/unitree_go2_blueprints.py +++ b/dimos/robot/unitree_webrtc/unitree_go2_blueprints.py @@ -27,6 +27,7 @@ from dimos.agents.cli.web import web_input from dimos.agents.ollama_agent import ollama_installed from dimos.agents.skills.navigation import navigation_skill +from dimos.agents.skills.person_follow import person_follow_skill from dimos.agents.skills.speak_skill import speak_skill from dimos.agents.spec import Provider from dimos.agents.vlm_agent import vlm_agent @@ -142,7 +143,13 @@ spatial = autoconnect( nav, - spatial_memory(), + # TODO: Turn back on. Turned off to stop logging. 
+ # EEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEE + # EEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEE + # EEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEE + # EEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEE + # EEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEE + # spatial_memory(), utilization(), ).global_config(n_dask_workers=8) @@ -168,6 +175,7 @@ _common_agentic = autoconnect( human_input(), navigation_skill(), + person_follow_skill(camera_info=GO2Connection.camera_info_static), unitree_skills(), web_input(), speak_skill(), diff --git a/dimos/simulation/mujoco/constants.py b/dimos/simulation/mujoco/constants.py index aca916a372..4e35011530 100644 --- a/dimos/simulation/mujoco/constants.py +++ b/dimos/simulation/mujoco/constants.py @@ -17,6 +17,7 @@ # Video/Camera constants VIDEO_WIDTH = 320 VIDEO_HEIGHT = 240 +VIDEO_CAMERA_FOV = 45 # MuJoCo default FOV for head_camera (degrees) DEPTH_CAMERA_FOV = 160 # Depth camera range/filtering constants diff --git a/dimos/simulation/mujoco/model.py b/dimos/simulation/mujoco/model.py index de533521da..bc309b7307 100644 --- a/dimos/simulation/mujoco/model.py +++ b/dimos/simulation/mujoco/model.py @@ -37,14 +37,21 @@ def _get_data_dir() -> epath.Path: def get_assets() -> dict[str, bytes]: data_dir = _get_data_dir() + assets: dict[str, bytes] = {} + # Assets used from https://sketchfab.com/3d-models/mersus-office-8714be387bcd406898b2615f7dae3a47 # Created by Ryan Cassidy and Coleman Costello - assets: dict[str, bytes] = {} mjx_env.update_assets(assets, data_dir, "*.xml") mjx_env.update_assets(assets, data_dir / "scene_office1/textures", "*.png") mjx_env.update_assets(assets, data_dir / "scene_office1/office_split", "*.obj") mjx_env.update_assets(assets, mjx_env.MENAGERIE_PATH / "unitree_go1" / "assets") mjx_env.update_assets(assets, mjx_env.MENAGERIE_PATH / "unitree_g1" / "assets") + + # From: 
def _add_person_object(root: ET.Element) -> None:
    """Register the person mesh assets and attach a mocap person body.

    Ensures ``<asset>`` and ``<worldbody>`` sections exist on *root*, declares
    the person mesh, its texture and material, then adds a mocap body named
    ``"person"`` carrying the textured mesh geom.
    """
    asset = root.find("asset")
    if asset is None:
        asset = ET.SubElement(root, "asset")

    ET.SubElement(asset, "mesh", name="person_mesh", file="jeong_seun_34.obj")
    ET.SubElement(asset, "texture", name="person_texture", file="material_0.png", type="2d")
    ET.SubElement(asset, "material", name="person_material", texture="person_texture")

    worldbody = root.find("worldbody")
    if worldbody is None:
        worldbody = ET.SubElement(root, "worldbody")

    # mocap="true" lets the simulation drive this body's pose directly.
    body = ET.SubElement(worldbody, "body", name="person", pos="0 0 0", mocap="true")

    # NOTE(review): the 90-degree rotation about X presumably stands the mesh
    # upright — confirm against the person asset's native orientation.
    ET.SubElement(
        body,
        "geom",
        type="mesh",
        mesh="person_mesh",
        material="person_material",
        euler="1.5708 0 0",
    )
-> None: camera_id = mujoco.mj_name2id(model, mujoco.mjtObj.mjOBJ_CAMERA, "head_camera") lidar_camera_id = mujoco.mj_name2id(model, mujoco.mjtObj.mjOBJ_CAMERA, "lidar_front_camera") + + person_position_controller = PersonPositionController(model) + lidar_left_camera_id = mujoco.mj_name2id(model, mujoco.mjtObj.mjOBJ_CAMERA, "lidar_left_camera") lidar_right_camera_id = mujoco.mj_name2id( model, mujoco.mjtObj.mjOBJ_CAMERA, "lidar_right_camera" @@ -138,6 +142,8 @@ def _run_simulation(config: GlobalConfig, shm: ShmReader) -> None: for _ in range(config.mujoco_steps_per_frame): mujoco.mj_step(model, data) + person_position_controller.tick(data) + m_viewer.sync() # Always update odometry @@ -220,6 +226,8 @@ def _run_simulation(config: GlobalConfig, shm: ShmReader) -> None: if time_until_next_step > 0: time.sleep(time_until_next_step) + person_position_controller.stop() + if __name__ == "__main__": diff --git a/dimos/simulation/mujoco/person_on_track.py b/dimos/simulation/mujoco/person_on_track.py new file mode 100644 index 0000000000..a816b5f3ee --- /dev/null +++ b/dimos/simulation/mujoco/person_on_track.py @@ -0,0 +1,160 @@ +# Copyright 2026 Dimensional Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
class PersonPositionController:
    """Drives the MuJoCo person mocap body from LCM pose updates.

    Subscribes to ``/person_pose`` and, on every simulation tick, copies the
    most recently received pose into the person body's mocap slots.
    """

    def __init__(self, model: mujoco.MjModel) -> None:
        body_id = mujoco.mj_name2id(model, mujoco.mjtObj.mjOBJ_BODY, "person")
        self._person_mocap_id = model.body_mocapid[body_id]
        self._latest_pose: Pose | None = None
        self._transport: LCMTransport[Pose] = LCMTransport("/person_pose", Pose)
        self._transport.subscribe(self._on_pose)

    def _on_pose(self, pose: Pose) -> None:
        # Invoked from the LCM transport; just remember the newest pose.
        self._latest_pose = pose

    def tick(self, data: mujoco.MjData) -> None:
        """Apply the latest received pose to the mocap body, if any arrived."""
        pose = self._latest_pose
        if pose is None:
            return

        mocap_id = self._person_mocap_id
        data.mocap_pos[mocap_id][0] = pose.position.x
        data.mocap_pos[mocap_id][1] = pose.position.y
        data.mocap_pos[mocap_id][2] = pose.position.z
        # MuJoCo quaternion order is (w, x, y, z).
        data.mocap_quat[mocap_id] = [
            pose.orientation.w,
            pose.orientation.x,
            pose.orientation.y,
            pose.orientation.z,
        ]

    def stop(self) -> None:
        """Shut down the LCM transport."""
        self._transport.stop()
class PersonTrackPublisher:
    """Publishes person poses along a closed track of waypoints via LCM.

    The person walks at constant speed between consecutive waypoints and
    smoothly rotates through each corner: the turn starts inside a
    ``_rotation_radius`` zone before a waypoint and finishes inside the same
    radius after it.
    """

    def __init__(self, track: list[tuple[float, float]]) -> None:
        self._speed = 0.004
        self._waypoint_threshold = 0.1
        self._rotation_radius = 1.0
        self._track = track
        self._current_waypoint_idx = 0
        self._initialized = False
        self._current_pos = np.array([0.0, 0.0])
        self._transport: LCMTransport[Pose] = LCMTransport("/person_pose", Pose)

    def _get_segment_heading(self, from_idx: int, to_idx: int) -> float:
        """Get heading angle for traveling from one waypoint to another."""
        from_wp = np.array(self._track[from_idx])
        to_wp = np.array(self._track[to_idx])
        direction = to_wp - from_wp
        return float(np.arctan2(direction[1], direction[0]))

    def _lerp_angle(self, a1: float, a2: float, t: float) -> float:
        """Interpolate from *a1* toward *a2* by fraction *t*, wrap-aware."""
        delta = a2 - a1
        # Bring the angular difference into [-pi, pi] before interpolating.
        while delta > np.pi:
            delta -= 2 * np.pi
        while delta < -np.pi:
            delta += 2 * np.pi
        return a1 + delta * t

    def tick(self) -> None:
        """Advance the person one step along the track and publish its pose."""
        if not self._initialized:
            # First call: place the person on the first waypoint, facing the
            # second one.
            start = self._track[0]
            self._current_pos = np.array([start[0], start[1]])
            self._current_waypoint_idx = 1
            self._publish_pose(self._current_pos, self._get_segment_heading(0, 1))
            self._initialized = True
            return

        n = len(self._track)

        curr_idx = self._current_waypoint_idx
        prev_idx = (curr_idx - 1) % n
        next_idx = (curr_idx + 1) % n
        prev_prev_idx = (prev_idx - 1) % n

        prev_wp = np.array(self._track[prev_idx])
        curr_wp = np.array(self._track[curr_idx])

        to_target = curr_wp - self._current_pos
        distance_to_curr = float(np.linalg.norm(to_target))
        distance_from_prev = float(np.linalg.norm(self._current_pos - prev_wp))

        # Headings for the turn at curr_wp ...
        incoming_heading = self._get_segment_heading(prev_idx, curr_idx)
        outgoing_heading = self._get_segment_heading(curr_idx, next_idx)
        # ... and for the (possibly still finishing) turn at prev_wp.
        prev_incoming_heading = self._get_segment_heading(prev_prev_idx, prev_idx)
        prev_outgoing_heading = incoming_heading

        in_leaving_zone = distance_from_prev < self._rotation_radius
        in_approaching_zone = distance_to_curr < self._rotation_radius

        # Approaching zone wins when both zones overlap, matching the original
        # priority; the two original branches computed the same value.
        if in_approaching_zone:
            # Starting the turn before curr_wp: t runs 0.0 -> 0.5.
            t = 0.5 * (1.0 - distance_to_curr / self._rotation_radius)
            heading = self._lerp_angle(incoming_heading, outgoing_heading, t)
        elif in_leaving_zone:
            # Finishing the previous turn after prev_wp: t runs 0.5 -> 1.0.
            t = 0.5 + 0.5 * (distance_from_prev / self._rotation_radius)
            heading = self._lerp_angle(prev_incoming_heading, prev_outgoing_heading, t)
        else:
            # Between zones: travel straight along the segment.
            heading = incoming_heading

        # Step toward the current waypoint at constant speed.
        if distance_to_curr > 0:
            step = (to_target / distance_to_curr) * self._speed
            self._current_pos[0] += step[0]
            self._current_pos[1] += step[1]

        # Advance to the next waypoint once close enough.
        if distance_to_curr < self._waypoint_threshold:
            self._current_waypoint_idx = next_idx

        # NOTE(review): the pi offset presumably compensates the person mesh's
        # forward axis — confirm against the model asset.
        self._publish_pose(self._current_pos, heading + np.pi)

    def _publish_pose(self, pos: NDArray[np.floating[Any]], heading: float) -> None:
        """Broadcast *pos* with yaw *heading* as a Pose on the LCM transport."""
        half = heading / 2
        c, s = np.cos(half), np.sin(half)
        pose = Pose(
            position=[pos[0], pos[1], 0.0],
            orientation=[0.0, 0.0, s, c],  # x, y, z, w
        )
        self._transport.broadcast(None, pose)

    def stop(self) -> None:
        """Shut down the LCM transport."""
        self._transport.stop()
"https://files.pythonhosted.org/packages/3e/38/7859ff46355f76f8d19459005ca000b6e7012f2f1ca597746cbcd1fbfe5e/antlr4-python3-runtime-4.9.3.tar.gz", hash = "sha256:f224469b4168294902bb1efa80a8bf7855f24c99aef99cbefc1bcd3cce77881b", size = 117034, upload-time = "2021-11-06T17:52:23.524Z" } + [[package]] name = "anyio" version = "4.12.1" @@ -1520,12 +1526,15 @@ manipulation = [ misc = [ { name = "catkin-pkg" }, { name = "cerebras-cloud-sdk" }, + { name = "edgetam-dimos" }, { name = "einops" }, { name = "empy" }, { name = "gdown" }, { name = "googlemaps" }, + { name = "hydra-core" }, { name = "ipykernel" }, { name = "lark" }, + { name = "omegaconf" }, { name = "onnx" }, { name = "open-clip-torch" }, { name = "opencv-contrib-python" }, @@ -1615,6 +1624,7 @@ requires-dist = [ { name = "dimos", extras = ["agents", "web", "perception", "visualization", "sim"], marker = "extra == 'base'" }, { name = "dimos", extras = ["base"], marker = "extra == 'unitree'" }, { name = "dimos-lcm" }, + { name = "edgetam-dimos", marker = "extra == 'misc'" }, { name = "einops", marker = "extra == 'misc'", specifier = "==0.8.1" }, { name = "empy", marker = "extra == 'misc'", specifier = "==3.3.4" }, { name = "fastapi", marker = "extra == 'web'", specifier = ">=0.115.6" }, @@ -1623,6 +1633,7 @@ requires-dist = [ { name = "gdown", marker = "extra == 'misc'", specifier = "==5.2.0" }, { name = "googlemaps", marker = "extra == 'misc'", specifier = ">=4.10.0" }, { name = "h5py", marker = "extra == 'manipulation'", specifier = ">=3.7.0" }, + { name = "hydra-core", marker = "extra == 'misc'", specifier = ">=1.3.0" }, { name = "ipykernel", marker = "extra == 'misc'" }, { name = "kaleido", marker = "extra == 'manipulation'", specifier = ">=0.2.1" }, { name = "langchain", marker = "extra == 'agents'", specifier = ">=1,<2" }, @@ -1647,6 +1658,7 @@ requires-dist = [ { name = "numpy", specifier = ">=1.26.4" }, { name = "nvidia-nvimgcodec-cu12", extras = ["all"], marker = "extra == 'cuda'" }, { name = "ollama", 
marker = "extra == 'agents'", specifier = ">=0.6.0" }, + { name = "omegaconf", marker = "extra == 'misc'", specifier = ">=2.3.0" }, { name = "onnx", marker = "extra == 'misc'" }, { name = "onnxruntime", marker = "extra == 'cpu'" }, { name = "onnxruntime-gpu", marker = "extra == 'cuda'", specifier = ">=1.17.1" }, @@ -1829,6 +1841,22 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/b0/0d/9feae160378a3553fa9a339b0e9c1a048e147a4127210e286ef18b730f03/durationpy-0.10-py3-none-any.whl", hash = "sha256:3b41e1b601234296b4fb368338fdcd3e13e0b4fb5b67345948f4f2bf9868b286", size = 3922, upload-time = "2025-05-17T13:52:36.463Z" }, ] +[[package]] +name = "edgetam-dimos" +version = "1.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "hydra-core" }, + { name = "iopath" }, + { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" }, + { name = "numpy", version = "2.3.5", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" }, + { name = "pillow" }, + { name = "torch" }, + { name = "torchvision" }, + { name = "tqdm" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/7a/ea/bec55e18e19b6e43ed5f18bfcb699933ab82744fa8a52209ac6e94a6d6d8/edgetam_dimos-1.0.tar.gz", hash = "sha256:4fea5fd5a5aa17f9145dc4f35abc41de9426acaa0d59cae9b467cf26e657d4a7", size = 74935, upload-time = "2026-01-19T22:53:39.159Z" } + [[package]] name = "einops" version = "0.8.1" @@ -2689,6 +2717,20 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/c5/7b/bca5613a0c3b542420cf92bd5e5fb8ebd5435ce1011a091f66bb7693285e/humanize-4.15.0-py3-none-any.whl", hash = "sha256:b1186eb9f5a9749cd9cb8565aee77919dd7c8d076161cf44d70e59e3301e1769", size = 132203, upload-time = "2025-12-20T20:16:11.67Z" }, ] +[[package]] +name = "hydra-core" +version = "1.3.2" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = 
"antlr4-python3-runtime" }, + { name = "omegaconf" }, + { name = "packaging" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/6d/8e/07e42bc434a847154083b315779b0a81d567154504624e181caf2c71cd98/hydra-core-1.3.2.tar.gz", hash = "sha256:8a878ed67216997c3e9d88a8e72e7b4767e81af37afb4ea3334b269a4390a824", size = 3263494, upload-time = "2023-02-23T18:33:43.03Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/c6/50/e0edd38dcd63fb26a8547f13d28f7a008bc4a3fd4eb4ff030673f22ad41a/hydra_core-1.3.2-py3-none-any.whl", hash = "sha256:fa0238a9e31df3373b35b0bfb672c34cc92718d21f81311d8996a16de1141d8b", size = 154547, upload-time = "2023-02-23T18:33:40.801Z" }, +] + [[package]] name = "identify" version = "2.6.15" @@ -2760,6 +2802,17 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/cb/b1/3846dd7f199d53cb17f49cba7e651e9ce294d8497c8c150530ed11865bb8/iniconfig-2.3.0-py3-none-any.whl", hash = "sha256:f631c04d2c48c52b84d0d0549c99ff3859c98df65b3101406327ecc7d53fbf12", size = 7484, upload-time = "2025-10-18T21:55:41.639Z" }, ] +[[package]] +name = "iopath" +version = "0.1.10" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "portalocker" }, + { name = "tqdm" }, + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/72/73/b3d451dfc523756cf177d3ebb0af76dc7751b341c60e2a21871be400ae29/iopath-0.1.10.tar.gz", hash = "sha256:3311c16a4d9137223e20f141655759933e1eda24f8bff166af834af3c645ef01", size = 42226, upload-time = "2022-07-09T19:00:50.866Z" } + [[package]] name = "ipykernel" version = "7.1.0" @@ -5210,6 +5263,19 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/47/4f/4a617ee93d8208d2bcf26b2d8b9402ceaed03e3853c754940e2290fed063/ollama-0.6.1-py3-none-any.whl", hash = "sha256:fc4c984b345735c5486faeee67d8a265214a31cbb828167782dc642ce0a2bf8c", size = 14354, upload-time = "2025-11-13T23:02:16.292Z" }, ] +[[package]] +name = "omegaconf" +version = "2.3.0" +source = { 
registry = "https://pypi.org/simple" } +dependencies = [ + { name = "antlr4-python3-runtime" }, + { name = "pyyaml" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/09/48/6388f1bb9da707110532cb70ec4d2822858ddfb44f1cdf1233c20a80ea4b/omegaconf-2.3.0.tar.gz", hash = "sha256:d5d4b6d29955cc50ad50c46dc269bcd92c6e00f5f90d23ab5fee7bfca4ba4cc7", size = 3298120, upload-time = "2022-12-08T20:59:22.753Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/e3/94/1843518e420fa3ed6919835845df698c7e27e183cb997394e4a670973a65/omegaconf-2.3.0-py3-none-any.whl", hash = "sha256:7b4df175cdb08ba400f45cae3bdcae7ba8365db4d165fc65fd04b050ab63b46b", size = 79500, upload-time = "2022-12-08T20:59:19.686Z" }, +] + [[package]] name = "onnx" version = "1.20.0" @@ -6023,6 +6089,18 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/d1/d8/a12e6aa14f63784cead437083319ec7cece0d5bb9a5bfe7678cc6578b52a/polars_runtime_32-1.36.1-cp39-abi3-win_arm64.whl", hash = "sha256:809e73857be71250141225ddd5d2b30c97e6340aeaa0d445f930e01bef6888dc", size = 39798896, upload-time = "2025-12-10T01:14:11.568Z" }, ] +[[package]] +name = "portalocker" +version = "3.2.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "pywin32", marker = "sys_platform == 'win32'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/5e/77/65b857a69ed876e1951e88aaba60f5ce6120c33703f7cb61a3c894b8c1b6/portalocker-3.2.0.tar.gz", hash = "sha256:1f3002956a54a8c3730586c5c77bf18fae4149e07eaf1c29fc3faf4d5a3f89ac", size = 95644, upload-time = "2025-06-14T13:20:40.03Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/4b/a6/38c8e2f318bf67d338f4d629e93b0b4b9af331f455f0390ea8ce4a099b26/portalocker-3.2.0-py3-none-any.whl", hash = "sha256:3cdc5f565312224bc570c49337bd21428bba0ef363bbcf58b9ef4a9f11779968", size = 22424, upload-time = "2025-06-14T13:20:38.083Z" }, +] + [[package]] name = "posthog" version = "5.4.0"