diff --git a/data/.lfs/person.tar.gz b/data/.lfs/person.tar.gz new file mode 100644 index 0000000000..1f32d0db58 --- /dev/null +++ b/data/.lfs/person.tar.gz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:332c3196c6436e7d4c2b7e3314b4a4055865ef358b2e9cf3c8ddd7e173f39b93 +size 2535758 diff --git a/dimos/agents/skills/person_follow.py b/dimos/agents/skills/person_follow.py new file mode 100644 index 0000000000..0d4420632c --- /dev/null +++ b/dimos/agents/skills/person_follow.py @@ -0,0 +1,248 @@ +# Copyright 2025-2026 Dimensional Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
from threading import Event, RLock
import time
from typing import TYPE_CHECKING

import numpy as np
from reactivex.disposable import Disposable

from dimos.core.core import rpc
from dimos.core.global_config import GlobalConfig
from dimos.core.skill_module import SkillModule
from dimos.core.stream import In, Out
from dimos.models.qwen.video_query import BBox
from dimos.models.segmentation.edge_tam import EdgeTAMProcessor
from dimos.models.vl.qwen import QwenVlModel
from dimos.msgs.geometry_msgs import Twist
from dimos.msgs.sensor_msgs import CameraInfo, Image, PointCloud2
from dimos.navigation.visual.query import get_object_bbox_from_image
from dimos.navigation.visual_servoing.detection_navigation import DetectionNavigation
from dimos.navigation.visual_servoing.visual_servoing_2d import VisualServoing2D
from dimos.protocol.skill.skill import skill
from dimos.utils.logging_config import setup_logger

if TYPE_CHECKING:
    from dimos.models.vl.base import VlModel

logger = setup_logger()


class PersonFollowSkillContainer(SkillModule):
    """Skill container for following a person.

    This skill uses:
    - A VL model (QwenVlModel) to initially detect a person from a text description.
    - EdgeTAM for continuous tracking across frames.
    - Visual servoing OR 3D navigation to control robot movement towards the person.
    - Does not do obstacle avoidance; assumes a clear path.
    """

    color_image: In[Image]
    global_map: In[PointCloud2]
    cmd_vel: Out[Twist]

    # Control loop frequency in Hz.
    _frequency: float = 20.0
    # Consecutive empty-detection frames tolerated before declaring the person lost.
    _max_lost_frames: int = 15

    def __init__(
        self,
        camera_info: CameraInfo,
        global_config: GlobalConfig,
        use_3d_navigation: bool = False,
    ) -> None:
        super().__init__()
        self._global_config: GlobalConfig = global_config
        self._use_3d_navigation: bool = use_3d_navigation
        self._latest_image: Image | None = None
        self._latest_pointcloud: PointCloud2 | None = None
        self._vl_model: VlModel = QwenVlModel()
        self._tracker: EdgeTAMProcessor | None = None
        self._should_stop: Event = Event()
        self._lock = RLock()

        # In simulation the supplied intrinsics do not apply; use MuJoCo's camera.
        if self._global_config.simulation:
            from dimos.robot.unitree_webrtc.mujoco_connection import MujocoConnection

            camera_info = MujocoConnection.camera_info_static

        self._camera_info = camera_info
        self._visual_servo = VisualServoing2D(camera_info, self._global_config.simulation)
        self._detection_navigation = DetectionNavigation(self.tf, camera_info)

    @rpc
    def start(self) -> None:
        super().start()
        self._disposables.add(Disposable(self.color_image.subscribe(self._on_color_image)))
        # The global map is only consumed when navigating in 3D.
        if self._use_3d_navigation:
            self._disposables.add(Disposable(self.global_map.subscribe(self._on_pointcloud)))

    @rpc
    def stop(self) -> None:
        self._stop_following()

        with self._lock:
            if self._tracker is not None:
                self._tracker.stop()
                self._tracker = None

        self._vl_model.stop()
        super().stop()

    @skill()
    def follow_person(self, query: str) -> str:
        """Follow a person matching the given description using visual servoing.

        The robot will continuously track and follow the person, while keeping
        them centered in the camera view.

        Args:
            query: Description of the person to follow (e.g., "man with blue shirt")

        Returns:
            Status message indicating the result of the following action.

        Example:
            follow_person("man with blue shirt")
            follow_person("person in the doorway")
        """

        self._stop_following()

        self._should_stop.clear()

        with self._lock:
            frame = self._latest_image

        if frame is None:
            return "No image available to detect person."

        # One-shot open-vocabulary detection to seed the tracker.
        initial_bbox = get_object_bbox_from_image(
            self._vl_model,
            frame,
            query,
        )

        if initial_bbox is None:
            return f"Could not find '{query}' in the current view."

        return self._follow_loop(query, initial_bbox)

    @skill()
    def stop_following(self) -> str:
        """Stop following the current person.

        Returns:
            Confirmation message.
        """
        self._stop_following()

        self.cmd_vel.publish(Twist.zero())

        return "Stopped following."

    def _on_color_image(self, image: Image) -> None:
        with self._lock:
            self._latest_image = image

    def _on_pointcloud(self, pointcloud: PointCloud2) -> None:
        with self._lock:
            self._latest_pointcloud = pointcloud

    def _follow_loop(self, query: str, initial_bbox: BBox) -> str:
        """Track the person seeded by ``initial_bbox`` and publish velocity commands.

        Runs at ``_frequency`` Hz until the person is lost, an error occurs, or a
        stop is requested; always finishes by publishing a zero twist.
        """
        x1, y1, x2, y2 = initial_bbox
        prompt_box = np.array([x1, y1, x2, y2], dtype=np.float32)

        with self._lock:
            if self._tracker is None:
                self._tracker = EdgeTAMProcessor()
            tracker = self._tracker
            frame = self._latest_image

        if frame is None:
            return "No image available to start tracking."

        initial_detections = tracker.init_track(
            image=frame,
            box=prompt_box,
            obj_id=1,
        )

        if len(initial_detections) == 0:
            self.cmd_vel.publish(Twist.zero())
            return f"EdgeTAM failed to segment '{query}'."

        logger.info(f"EdgeTAM initialized with {len(initial_detections)} detections")

        frames_lost = 0
        period = 1.0 / self._frequency
        next_tick = time.monotonic()

        while not self._should_stop.is_set():
            next_tick += period

            with self._lock:
                frame = self._latest_image
            assert frame is not None

            detections = tracker.process_image(frame)

            if len(detections) == 0:
                self.cmd_vel.publish(Twist.zero())

                frames_lost += 1
                if frames_lost > self._max_lost_frames:
                    self.cmd_vel.publish(Twist.zero())
                    return f"Lost track of '{query}'. Stopping."
            else:
                frames_lost = 0
                # Follow the detection with the largest bounding box.
                best_detection = max(detections.detections, key=lambda d: d.bbox_2d_volume())

                if self._use_3d_navigation:
                    with self._lock:
                        pointcloud = self._latest_pointcloud
                    if pointcloud is None:
                        self.cmd_vel.publish(Twist.zero())
                        return "No pointcloud available for 3D navigation. Stopping."
                    twist = self._detection_navigation.compute_twist_for_detection_3d(
                        pointcloud,
                        best_detection,
                        frame,
                    )
                    if twist is None:
                        self.cmd_vel.publish(Twist.zero())
                        return f"3D navigation failed for '{query}'. Stopping."
                else:
                    twist = self._visual_servo.compute_twist(
                        best_detection.bbox,
                        frame.width,
                    )
                self.cmd_vel.publish(twist)

            # Fixed-rate pacing: sleep off whatever remains of this period.
            remaining = next_tick - time.monotonic()
            if remaining > 0:
                time.sleep(remaining)

        self.cmd_vel.publish(Twist.zero())
        return "Stopped following as requested."

    def _stop_following(self) -> None:
        self._should_stop.set()


person_follow_skill = PersonFollowSkillContainer.blueprint

__all__ = ["PersonFollowSkillContainer", "person_follow_skill"]
lcm_spy.save_topic("/rpc/DemoCalculatorSkill/sum_numbers/req") lcm_spy.save_topic("/rpc/DemoCalculatorSkill/sum_numbers/res") - start_blueprint("demo-skill") + start_blueprint("run", "demo-skill") lcm_spy.wait_for_saved_topic("/rpc/DemoCalculatorSkill/set_AgentSpec_register_skills/res") lcm_spy.wait_for_saved_topic("/rpc/HumanInput/start/res") diff --git a/dimos/e2e_tests/test_person_follow.py b/dimos/e2e_tests/test_person_follow.py new file mode 100644 index 0000000000..663dee420e --- /dev/null +++ b/dimos/e2e_tests/test_person_follow.py @@ -0,0 +1,85 @@ +# Copyright 2025-2026 Dimensional Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
from collections.abc import Callable, Generator
import os
import threading
import time

import pytest

from dimos.e2e_tests.dimos_cli_call import DimosCliCall
from dimos.e2e_tests.lcm_spy import LcmSpy
from dimos.simulation.mujoco.person_on_track import PersonTrackPublisher

# Callable that starts a simulated person walking along the given XY waypoints.
StartPersonTrack = Callable[[list[tuple[float, float]]], None]


@pytest.fixture
def start_person_track() -> Generator[StartPersonTrack, None, None]:
    """Yield a starter for a background thread that animates a person in MuJoCo.

    The returned callable creates a ``PersonTrackPublisher`` for the given track
    and ticks it at ~60 Hz on a daemon thread. Teardown stops the thread and
    the publisher.
    """
    publisher: PersonTrackPublisher | None = None
    stop_event = threading.Event()
    thread: threading.Thread | None = None

    def start(track: list[tuple[float, float]]) -> None:
        nonlocal publisher, thread
        publisher = PersonTrackPublisher(track)

        def run_person_track() -> None:
            while not stop_event.is_set():
                publisher.tick()
                time.sleep(1 / 60)

        thread = threading.Thread(target=run_person_track, daemon=True)
        thread.start()

    yield start

    stop_event.set()
    if thread is not None:
        thread.join(timeout=1.0)
    if publisher is not None:
        publisher.stop()


@pytest.mark.skipif(bool(os.getenv("CI")), reason="LCM spy doesn't work in CI.")
@pytest.mark.skipif(not os.getenv("OPENAI_API_KEY"), reason="OPENAI_API_KEY not set.")
@pytest.mark.e2e
def test_person_follow(
    lcm_spy: LcmSpy,
    # start_blueprint takes variadic CLI args (see conftest), not a single name.
    start_blueprint: Callable[..., DimosCliCall],
    human_input: Callable[[str], None],
    start_person_track: StartPersonTrack,
) -> None:
    """E2E: the agent follows a scripted person and ends near the track."""
    start_blueprint("--mujoco-start-pos", "-6.18 0.96", "run", "unitree-go2-agentic")

    # Wait for the agent stack to come up before interacting with it.
    lcm_spy.save_topic("/rpc/HumanInput/start/res")
    lcm_spy.wait_for_saved_topic("/rpc/HumanInput/start/res", timeout=120.0)
    lcm_spy.save_topic("/agent")
    lcm_spy.wait_for_saved_topic_content("/agent", b"AIMessage", timeout=120.0)

    time.sleep(5)

    start_person_track(
        [
            (-2.60, 1.28),
            (4.80, 0.21),
            (4.14, -6.0),
            (0.59, -3.79),
            (-3.35, -0.51),
        ]
    )
    human_input("follow the person in beige pants")

    lcm_spy.wait_until_odom_position(4.2, -3, threshold=1.5)
a/dimos/e2e_tests/test_spatial_memory.py b/dimos/e2e_tests/test_spatial_memory.py index 5029f46525..171f638f19 100644 --- a/dimos/e2e_tests/test_spatial_memory.py +++ b/dimos/e2e_tests/test_spatial_memory.py @@ -32,7 +32,7 @@ def test_spatial_memory_navigation( human_input: Callable[[str], None], follow_points: Callable[..., None], ) -> None: - start_blueprint("unitree-go2-agentic") + start_blueprint("run", "unitree-go2-agentic") lcm_spy.save_topic("/rpc/HumanInput/start/res") lcm_spy.wait_for_saved_topic("/rpc/HumanInput/start/res", timeout=120.0) diff --git a/dimos/models/segmentation/configs/edgetam.yaml b/dimos/models/segmentation/configs/edgetam.yaml new file mode 100644 index 0000000000..6fe21c99df --- /dev/null +++ b/dimos/models/segmentation/configs/edgetam.yaml @@ -0,0 +1,138 @@ +# @package _global_ + +# Model +model: + _target_: sam2.sam2_video_predictor.SAM2VideoPredictor + image_encoder: + _target_: sam2.modeling.backbones.image_encoder.ImageEncoder + scalp: 1 + trunk: + _target_: sam2.modeling.backbones.timm.TimmBackbone + name: repvit_m1.dist_in1k + features: + - layer0 + - layer1 + - layer2 + - layer3 + neck: + _target_: sam2.modeling.backbones.image_encoder.FpnNeck + position_encoding: + _target_: sam2.modeling.position_encoding.PositionEmbeddingSine + num_pos_feats: 256 + normalize: true + scale: null + temperature: 10000 + d_model: 256 + backbone_channel_list: [384, 192, 96, 48] + fpn_top_down_levels: [2, 3] # output level 0 and 1 directly use the backbone features + fpn_interp_model: nearest + + memory_attention: + _target_: sam2.modeling.memory_attention.MemoryAttention + d_model: 256 + pos_enc_at_input: true + layer: + _target_: sam2.modeling.memory_attention.MemoryAttentionLayer + activation: relu + dim_feedforward: 2048 + dropout: 0.1 + pos_enc_at_attn: false + self_attention: + _target_: sam2.modeling.sam.transformer.RoPEAttention + rope_theta: 10000.0 + feat_sizes: [32, 32] + embedding_dim: 256 + num_heads: 1 + downsample_rate: 1 + dropout: 
0.1 + d_model: 256 + pos_enc_at_cross_attn_keys: true + pos_enc_at_cross_attn_queries: false + cross_attention: + _target_: sam2.modeling.sam.transformer.RoPEAttentionv2 + rope_theta: 10000.0 + q_sizes: [64, 64] + k_sizes: [16, 16] + embedding_dim: 256 + num_heads: 1 + downsample_rate: 1 + dropout: 0.1 + kv_in_dim: 64 + num_layers: 2 + + memory_encoder: + _target_: sam2.modeling.memory_encoder.MemoryEncoder + out_dim: 64 + position_encoding: + _target_: sam2.modeling.position_encoding.PositionEmbeddingSine + num_pos_feats: 64 + normalize: true + scale: null + temperature: 10000 + mask_downsampler: + _target_: sam2.modeling.memory_encoder.MaskDownSampler + kernel_size: 3 + stride: 2 + padding: 1 + fuser: + _target_: sam2.modeling.memory_encoder.Fuser + layer: + _target_: sam2.modeling.memory_encoder.CXBlock + dim: 256 + kernel_size: 7 + padding: 3 + layer_scale_init_value: 1e-6 + use_dwconv: True # depth-wise convs + num_layers: 2 + + spatial_perceiver: + _target_: sam2.modeling.perceiver.PerceiverResampler + depth: 2 + dim: 64 + dim_head: 64 + heads: 1 + ff_mult: 4 + hidden_dropout_p: 0. + attention_dropout_p: 0. 
+ pos_enc_at_key_value: true # implicit pos + concat_kv_latents: false + num_latents: 256 + num_latents_2d: 256 + position_encoding: + _target_: sam2.modeling.position_encoding.PositionEmbeddingSine + num_pos_feats: 64 + normalize: true + scale: null + temperature: 10000 + use_self_attn: true + + num_maskmem: 7 + image_size: 1024 + # apply scaled sigmoid on mask logits for memory encoder, and directly feed input mask as output mask + sigmoid_scale_for_mem_enc: 20.0 + sigmoid_bias_for_mem_enc: -10.0 + use_mask_input_as_output_without_sam: true + # Memory + directly_add_no_mem_embed: true + # use high-resolution feature map in the SAM mask decoder + use_high_res_features_in_sam: true + # output 3 masks on the first click on initial conditioning frames + multimask_output_in_sam: true + # SAM heads + iou_prediction_use_sigmoid: True + # cross-attend to object pointers from other frames (based on SAM output tokens) in the encoder + use_obj_ptrs_in_encoder: true + add_tpos_enc_to_obj_ptrs: false + only_obj_ptrs_in_the_past_for_eval: true + # object occlusion prediction + pred_obj_scores: true + pred_obj_scores_mlp: true + fixed_no_obj_ptr: true + # multimask tracking settings + multimask_output_for_tracking: true + use_multimask_token_for_obj_ptr: true + multimask_min_pt_num: 0 + multimask_max_pt_num: 1 + use_mlp_for_obj_ptr_proj: true + # Compilation flag + compile_image_encoder: false diff --git a/dimos/models/segmentation/edge_tam.py b/dimos/models/segmentation/edge_tam.py new file mode 100644 index 0000000000..ba351be130 --- /dev/null +++ b/dimos/models/segmentation/edge_tam.py @@ -0,0 +1,269 @@ +# Copyright 2025-2026 Dimensional Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
from collections.abc import Generator
from contextlib import contextmanager
import os
from pathlib import Path
import shutil
import tempfile
from typing import TYPE_CHECKING, Any, TypedDict

import cv2
from hydra.utils import instantiate  # type: ignore[import-not-found]
import numpy as np
from numpy.typing import NDArray
from omegaconf import OmegaConf  # type: ignore[import-not-found]
from PIL import Image as PILImage
import torch

from dimos.msgs.sensor_msgs import Image
from dimos.perception.detection.detectors.types import Detector
from dimos.perception.detection.type import ImageDetections2D
from dimos.perception.detection.type.detection2d.seg import Detection2DSeg
from dimos.utils.data import get_data
from dimos.utils.logging_config import setup_logger

if TYPE_CHECKING:
    from sam2.sam2_video_predictor import SAM2VideoPredictor  # type: ignore[import-untyped]

# Silence tqdm progress bars from the SAM2 predictor.
os.environ['TQDM_DISABLE'] = '1'

logger = setup_logger()


class SAM2InferenceState(TypedDict):
    # Subset of SAM2's video-predictor inference state that this wrapper mutates.
    images: list[torch.Tensor | None]
    num_frames: int
    cached_features: dict[int, Any]


class EdgeTAMProcessor(Detector):
    """Streaming object tracker built on EdgeTAM (a SAM2 video predictor).

    Tracking is seeded once with a point/box prompt via ``init_track`` and then
    propagated one frame at a time via ``process_image``. Requires CUDA.
    """

    _predictor: "SAM2VideoPredictor"
    _inference_state: SAM2InferenceState | None
    _frame_count: int
    _is_tracking: bool
    _buffer_size: int

    def __init__(self) -> None:
        config_path = Path(__file__).parent / "configs" / "edgetam.yaml"

        if not config_path.exists():
            raise FileNotFoundError(f"EdgeTAM config not found at {config_path}")

        if not torch.cuda.is_available():
            raise RuntimeError("EdgeTAM requires a CUDA-capable GPU")

        cfg = OmegaConf.load(config_path)

        # Runtime overrides layered on top of the static YAML config.
        overrides = {
            "model.sam_mask_decoder_extra_args.dynamic_multimask_via_stability": True,
            "model.sam_mask_decoder_extra_args.dynamic_multimask_stability_delta": 0.05,
            "model.sam_mask_decoder_extra_args.dynamic_multimask_stability_thresh": 0.98,
            "model.binarize_mask_from_pts_for_mem_enc": True,
            "model.fill_hole_area": 8,
        }
        for key, value in overrides.items():
            OmegaConf.update(cfg, key, value)

        if cfg.model._target_ != "sam2.sam2_video_predictor.SAM2VideoPredictor":
            logger.warning(
                f"Config target is {cfg.model._target_}, forcing SAM2VideoPredictor"
            )
            cfg.model._target_ = "sam2.sam2_video_predictor.SAM2VideoPredictor"

        self._predictor = instantiate(cfg.model, _recursive_=True)

        ckpt_path = str(get_data("models_edgetam") / "edgetam.pt")

        state_dict = torch.load(ckpt_path, map_location="cpu", weights_only=True)["model"]
        # NOTE(review): load_state_dict defaults to strict=True, which already raises
        # on key mismatch, so these manual raises may be unreachable — confirm intent.
        missing_keys, unexpected_keys = self._predictor.load_state_dict(state_dict)
        if missing_keys:
            raise RuntimeError("Missing keys in checkpoint")
        if unexpected_keys:
            raise RuntimeError("Unexpected keys in checkpoint")

        self._predictor = self._predictor.to("cuda")
        self._predictor.eval()

        self._inference_state = None
        self._frame_count = 0
        self._is_tracking = False
        self._buffer_size = 100  # Keep last N frames in memory to avoid OOM

    def _prepare_frame(self, image: Image) -> torch.Tensor:
        """Prepare frame for SAM2 (resize, normalize, convert to tensor)."""
        bgr = image.to_opencv()
        rgb = cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB)
        pil = PILImage.fromarray(rgb)

        side = self._predictor.image_size
        arr = np.array(pil.resize((side, side))).astype(np.float32) / 255.0

        # ImageNet normalization expected by the SAM2 image encoder.
        mean = np.array([0.485, 0.456, 0.406], dtype=np.float32).reshape(1, 1, 3)
        std = np.array([0.229, 0.224, 0.225], dtype=np.float32).reshape(1, 1, 3)
        arr -= mean
        arr /= std

        return torch.from_numpy(arr).permute(2, 0, 1).float().cuda()

    def init_track(
        self,
        image: Image,
        points: NDArray[np.floating[Any]] | None = None,
        labels: NDArray[np.integer[Any]] | None = None,
        box: NDArray[np.floating[Any]] | None = None,
        obj_id: int = 1,
    ) -> ImageDetections2D:
        """Initialize tracking with a prompt (points or box).

        Args:
            image: Initial frame to start tracking from
            points: Point prompts for segmentation (Nx2 array of [x, y] coordinates)
            labels: Labels for points (1 = foreground, 0 = background)
            box: Bounding box prompt in [x1, y1, x2, y2] format
            obj_id: Object ID for tracking

        Returns:
            ImageDetections2D with initial segmentation mask
        """
        # Restart cleanly if a previous track is still active.
        if self._inference_state is not None:
            self.stop()

        self._frame_count = 0

        # SAM2's init_state expects a video directory; feed it the single seed frame.
        with _temp_dir_context(image) as video_path:
            self._inference_state = self._predictor.init_state(video_path=video_path)

        self._predictor.reset_state(self._inference_state)

        # Convert the stacked-frames tensor into a mutable per-frame list so
        # process_image can append / drop frames incrementally.
        if torch.is_tensor(self._inference_state["images"]):
            self._inference_state["images"] = [self._inference_state["images"][0]]

        self._is_tracking = True

        if points is not None:
            points = points.astype(np.float32)
        if labels is not None:
            labels = labels.astype(np.int32)
        if box is not None:
            box = box.astype(np.float32)

        with torch.no_grad():
            _, out_obj_ids, out_mask_logits = self._predictor.add_new_points_or_box(
                inference_state=self._inference_state,
                frame_idx=0,
                obj_id=obj_id,
                points=points,
                labels=labels,
                box=box,
            )

        return self._process_results(image, out_obj_ids, out_mask_logits)

    def process_image(self, image: Image) -> ImageDetections2D:
        """Process a new video frame and propagate tracking.

        Args:
            image: New frame to process

        Returns:
            ImageDetections2D with tracked object segmentation masks
        """
        if not self._is_tracking or self._inference_state is None:
            return ImageDetections2D(image=image)

        self._frame_count += 1

        # Append new frame to inference state
        frame_tensor = self._prepare_frame(image)
        self._inference_state["images"].append(frame_tensor)
        self._inference_state["num_frames"] += 1

        # Memory management: evict stale cached features and old frame tensors
        # so long-running tracks do not exhaust GPU/host memory.
        cached_features = self._inference_state["cached_features"]
        if len(cached_features) > self._buffer_size:
            oldest_frame = min(cached_features.keys())
            if oldest_frame < self._frame_count - self._buffer_size:
                del cached_features[oldest_frame]

        if len(self._inference_state["images"]) > self._buffer_size + 10:
            idx_to_drop = self._frame_count - self._buffer_size - 5
            if 0 <= idx_to_drop < len(self._inference_state["images"]):
                if self._inference_state["images"][idx_to_drop] is not None:
                    self._inference_state["images"][idx_to_drop] = None

        empty: ImageDetections2D = ImageDetections2D(image=image)

        with torch.no_grad():
            for out_frame_idx, out_obj_ids, out_mask_logits in self._predictor.propagate_in_video(
                self._inference_state, start_frame_idx=self._frame_count, max_frame_num_to_track=1
            ):
                if out_frame_idx == self._frame_count:
                    return self._process_results(image, out_obj_ids, out_mask_logits)

        return empty

    def _process_results(
        self,
        image: Image,
        obj_ids: list[int],
        mask_logits: torch.Tensor | NDArray[np.floating[Any]],
    ) -> ImageDetections2D:
        """Convert SAM2 mask logits into validated 2D segmentation detections."""
        detections: ImageDetections2D = ImageDetections2D(image=image)

        if len(obj_ids) == 0:
            return detections

        if isinstance(mask_logits, torch.Tensor):
            mask_logits = mask_logits.cpu().numpy()

        for i, obj_id in enumerate(obj_ids):
            seg = Detection2DSeg.from_sam2_result(
                mask=mask_logits[i],
                obj_id=obj_id,
                image=image,
                name="object",
            )
            if seg.is_valid():
                detections.detections.append(seg)

        return detections

    def stop(self) -> None:
        """Stop tracking and drop all per-video inference state."""
        self._is_tracking = False
        self._inference_state = None


@contextmanager
def _temp_dir_context(image: Image) -> Generator[str, None, None]:
    """Yield a temp directory containing *image* as frame ``00000.jpg``; clean up after."""
    path = tempfile.mkdtemp()

    image.save(f"{path}/00000.jpg")

    try:
        yield path
    finally:
        shutil.rmtree(path)
from dimos_lcm.sensor_msgs import CameraInfo as DimosLcmCameraInfo
import numpy as np

from dimos.msgs.geometry_msgs import Transform, Twist, Vector3
from dimos.msgs.sensor_msgs import CameraInfo, Image, PointCloud2
from dimos.perception.detection.type.detection2d.bbox import Detection2DBBox
from dimos.perception.detection.type.detection3d import Detection3DPC
from dimos.protocol.tf import LCMTF
from dimos.utils.logging_config import setup_logger

logger = setup_logger()


class DetectionNavigation:
    """Compute velocity commands that steer the robot towards a 2D detection.

    The detection is lifted to 3D using the world pointcloud and camera
    intrinsics, and a proportional controller produces a Twist that approaches
    the target while maintaining a standoff distance.
    """

    _target_distance_3d: float = 1.5  # meters to maintain from person
    _min_distance_3d: float = 0.8  # meters before backing up
    _max_linear_speed_3d: float = 0.5  # m/s
    _max_angular_speed_3d: float = 0.8  # rad/s
    _linear_gain_3d: float = 0.8
    _angular_gain_3d: float = 1.5

    _tf: LCMTF
    _camera_info: CameraInfo

    def __init__(self, tf: LCMTF, camera_info: CameraInfo) -> None:
        self._tf = tf
        self._camera_info = camera_info

    def compute_twist_for_detection_3d(
        self, pointcloud: PointCloud2, detection: Detection2DBBox, image: Image
    ) -> Twist | None:
        """Project a 2D detection to 3D using pointcloud and compute navigation twist.

        Args:
            pointcloud: World-frame pointcloud used to lift the detection to 3D
            detection: 2D detection with bounding box
            image: Current image frame

        Returns:
            Twist command to navigate towards the detection's 3D position,
            or None if the required transforms or 3D projection are unavailable.
        """

        # Get transform from world frame to camera optical frame
        world_to_optical = self._tf.get(
            "camera_optical", pointcloud.frame_id, image.ts, time_tolerance=1.0
        )
        if world_to_optical is None:
            logger.warning("Could not get camera transform")
            return None

        lcm_camera_info = DimosLcmCameraInfo()
        lcm_camera_info.K = self._camera_info.K
        lcm_camera_info.width = self._camera_info.width
        lcm_camera_info.height = self._camera_info.height

        # Project to 3D using the pointcloud
        detection_3d = Detection3DPC.from_2d(
            det=detection,
            world_pointcloud=pointcloud,
            camera_info=lcm_camera_info,
            world_to_optical_transform=world_to_optical,
            filters=[],  # Skip filtering for faster processing in follow loop
        )

        if detection_3d is None:
            logger.warning("3D projection failed")
            return None

        # Get robot position to compute robust target
        robot_transform = self._tf.get("world", "base_link", time_tolerance=1.0)
        if robot_transform is None:
            logger.warning("Could not get robot transform")
            return None

        robot_pos = robot_transform.translation

        # Compute robust target position using front-most points
        target_position = self._compute_robust_target_position(detection_3d.pointcloud, robot_pos)
        if target_position is None:
            logger.warning("Could not compute robust target position")
            return None

        return self._compute_twist_from_3d(target_position, robot_transform)

    def _compute_robust_target_position(
        self, pointcloud: PointCloud2, robot_pos: Vector3
    ) -> Vector3 | None:
        """Compute a robust target position from the detection pointcloud.

        Instead of using the centroid of all points (which includes floor/background),
        this method:
        1. Filters out floor points (z < 0.3m in world frame)
        2. Computes distance from robot to each remaining point
        3. Uses the 25th percentile of closest points to get the front surface
        4. Returns the centroid of those front-most points

        Args:
            pointcloud: The detection's pointcloud in world frame
            robot_pos: Robot's current position in world frame

        Returns:
            Vector3 position representing the front of the detected object,
            or None if not enough valid points.
        """
        points, _ = pointcloud.as_numpy()
        if len(points) < 10:
            return None

        # Filter out floor points (keep points above 0.3m height)
        height_mask = points[:, 2] > 0.3
        points = points[height_mask]
        if len(points) < 10:
            # Fall back to all points if height filtering removes too many
            points, _ = pointcloud.as_numpy()

        # Compute 2D distance (XY plane) from robot to each point
        dx = points[:, 0] - robot_pos.x
        dy = points[:, 1] - robot_pos.y
        distances = np.sqrt(dx * dx + dy * dy)

        # Use 25th percentile of distances to find front-most points
        distance_threshold = np.percentile(distances, 25)

        # Get points that are within the front 25%
        front_mask = distances <= distance_threshold
        front_points = points[front_mask]

        if len(front_points) < 3:
            # Fall back to median distance point
            median_dist = np.median(distances)
            close_mask = np.abs(distances - median_dist) < 0.3
            front_points = points[close_mask]
            if len(front_points) < 3:
                return None

        # Compute centroid of front-most points
        centroid = front_points.mean(axis=0)
        return Vector3(centroid[0], centroid[1], centroid[2])

    def _compute_twist_from_3d(self, target_position: Vector3, robot_transform: Transform) -> Twist:
        """Compute twist command to navigate towards a 3D target position.

        Args:
            target_position: 3D position of the target in world frame.
            robot_transform: Robot's current transform in world frame.

        Returns:
            Twist command for the robot.
        """
        robot_pos = robot_transform.translation

        # Compute vector from robot to target in world frame
        dx = target_position.x - robot_pos.x
        dy = target_position.y - robot_pos.y
        distance = np.sqrt(dx * dx + dy * dy)
        # Use lazy logging instead of print in this high-frequency control path.
        logger.debug("Distance to target: %.2f m", distance)

        # Compute angle to target in world frame
        angle_to_target = np.arctan2(dy, dx)

        # Get robot's current heading from transform
        robot_yaw = robot_transform.rotation.to_euler().z

        # Angle error (how much to turn)
        angle_error = angle_to_target - robot_yaw
        # Normalize to [-pi, pi]
        while angle_error > np.pi:
            angle_error -= 2 * np.pi
        while angle_error < -np.pi:
            angle_error += 2 * np.pi

        # Compute angular velocity (turn towards target)
        angular_z = angle_error * self._angular_gain_3d
        angular_z = float(
            np.clip(angular_z, -self._max_angular_speed_3d, self._max_angular_speed_3d)
        )

        # Compute linear velocity based on distance
        distance_error = distance - self._target_distance_3d

        if distance < self._min_distance_3d:
            # Too close, back up
            linear_x = -self._max_linear_speed_3d * 0.6
        else:
            # Move forward based on distance error, reduce speed when turning
            turn_factor = 1.0 - min(abs(angle_error) / np.pi, 0.7)
            linear_x = distance_error * self._linear_gain_3d * turn_factor
            linear_x = float(
                np.clip(linear_x, -self._max_linear_speed_3d, self._max_linear_speed_3d)
            )

        return Twist(
            linear=Vector3(linear_x, 0.0, 0.0),
            angular=Vector3(0.0, 0.0, angular_z),
        )
class VisualServoing2D:
    """2D visual servoing controller for tracking objects using bounding boxes.

    Uses camera intrinsics to convert pixel coordinates to normalized camera
    coordinates and estimates distance based on a known (assumed) object width.
    """

    # Target distance to maintain from object (meters).
    _target_distance: float = 1.5

    # Minimum distance before backing up (meters).
    _min_distance: float = 0.8

    # Maximum forward/backward speed (m/s).
    _max_linear_speed: float = 0.5

    # Maximum turning speed (rad/s).
    _max_angular_speed: float = 0.8

    # Assumed real-world width of tracked object (meters).
    _assumed_object_width: float = 0.45

    # Proportional gain for angular velocity control.
    _angular_gain: float = 1.0

    # Proportional gain for linear velocity control.
    _linear_gain: float = 0.8

    # Speed factor when backing up (multiplied by max_linear_speed).
    _backup_speed_factor: float = 0.6

    # Multiplier for x_norm when calculating turn factor.
    _turn_factor_multiplier: float = 2.0

    # Maximum speed reduction due to turning (turn_factor ranges from 1-this to 1).
    _turn_factor_max_reduction: float = 0.7

    # When True, the platform cannot rotate in place and needs some forward
    # motion while turning.
    _rotation_requires_linear_movement: bool = False

    # Camera intrinsics for coordinate conversion.
    _camera_info: CameraInfo

    def __init__(
        self, camera_info: CameraInfo, rotation_requires_linear_movement: bool = False
    ) -> None:
        self._camera_info = camera_info
        self._rotation_requires_linear_movement = rotation_requires_linear_movement

    def compute_twist(
        self,
        bbox: tuple[float, float, float, float],
        image_width: int,
    ) -> Twist:
        """Compute twist command to servo towards the tracked object.

        Args:
            bbox: Bounding box (x1, y1, x2, y2) in pixels.
            image_width: Width of the image.

        Returns:
            Twist command for the robot (zero twist if the bbox is degenerate).
        """
        x1, _, x2, _ = bbox
        bbox_center_x = (x1 + x2) / 2.0

        # Normalized x via inverse K; positive = object right of optical center.
        x_norm = self._get_normalized_x(bbox_center_x)

        estimated_distance = self._estimate_distance(bbox)

        if estimated_distance is None:
            return Twist.zero()

        # Distance error (positive = too far, need to move forward).
        distance_error = estimated_distance - self._target_distance

        # Angular velocity (turn towards object). Negative because positive
        # angular.z is counter-clockwise (left turn).
        angular_z = -x_norm * self._angular_gain
        angular_z = float(np.clip(angular_z, -self._max_angular_speed, self._max_angular_speed))

        # Linear velocity - ALWAYS move forward/backward based on distance.
        # Reduce forward speed when turning sharply to maintain stability.
        turn_factor = 1.0 - min(
            abs(x_norm) * self._turn_factor_multiplier, self._turn_factor_max_reduction
        )

        if estimated_distance < self._min_distance:
            # Too close, back up (don't reduce speed for backing up).
            linear_x = -self._max_linear_speed * self._backup_speed_factor
        else:
            # Move forward based on distance error with proportional gain.
            linear_x = distance_error * self._linear_gain * turn_factor
            linear_x = float(np.clip(linear_x, -self._max_linear_speed, self._max_linear_speed))

        # Enforce minimum linear speed when turning. BUGFIX: the original
        # condition was `abs(angular_z) < 0.02`, which enforced forward motion
        # only when NOT turning — the opposite of the stated intent and of the
        # `_rotation_requires_linear_movement` flag's meaning.
        if self._rotation_requires_linear_movement and abs(angular_z) > 0.02:
            linear_x = max(linear_x, 0.1)

        return Twist(
            linear=Vector3(linear_x, 0.0, 0.0),
            angular=Vector3(0.0, 0.0, angular_z),
        )

    def _get_normalized_x(self, pixel_x: float) -> float:
        """Convert pixel x coordinate to normalized camera coordinate.

        Uses inverse K matrix: x_norm = (pixel_x - cx) / fx

        Args:
            pixel_x: x coordinate in pixels

        Returns:
            Normalized x coordinate (tan of angle from optical center)
        """
        fx = self._camera_info.K[0]  # focal length x
        cx = self._camera_info.K[2]  # optical center x
        return (pixel_x - cx) / fx

    def _estimate_distance(self, bbox: tuple[float, float, float, float]) -> float | None:
        """Estimate distance to object from bbox size and camera intrinsics.

        Uses the pinhole camera model:
            pixel_width / fx = real_width / distance
            distance = (real_width * fx) / pixel_width

        Uses bbox width instead of height because a ground robot can't see
        full person height when close; width (shoulders) is more consistently
        visible.

        Args:
            bbox: Bounding box (x1, y1, x2, y2) in pixels.

        Returns:
            Estimated distance in meters, or None if bbox is invalid.
        """
        bbox_width = bbox[2] - bbox[0]  # x2 - x1

        if bbox_width <= 0:
            return None

        fx = self._camera_info.K[0]  # focal length x in pixels
        return (self._assumed_object_width * fx) / bbox_width
+ + Uses pinhole camera model: f = height / (2 * tan(fovy / 2)) + """ + import math + + fovy = math.radians(VIDEO_CAMERA_FOV) + f = VIDEO_HEIGHT / (2 * math.tan(fovy / 2)) + cx = VIDEO_WIDTH / 2.0 + cy = VIDEO_HEIGHT / 2.0 + + return CameraInfo( + frame_id="camera_optical", + height=VIDEO_HEIGHT, + width=VIDEO_WIDTH, + distortion_model="plumb_bob", + D=[0.0, 0.0, 0.0, 0.0, 0.0], + K=[f, 0.0, cx, 0.0, f, cy, 0.0, 0.0, 1.0], + R=[1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0], + P=[f, 0.0, cx, 0.0, 0.0, f, cy, 0.0, 0.0, 0.0, 1.0, 0.0], + ) + + camera_info_static: CameraInfo = _compute_camera_info() + def start(self) -> None: self.shm_data = ShmWriter() diff --git a/dimos/robot/unitree_webrtc/unitree_go2_blueprints.py b/dimos/robot/unitree_webrtc/unitree_go2_blueprints.py index be53702412..8a017ad004 100644 --- a/dimos/robot/unitree_webrtc/unitree_go2_blueprints.py +++ b/dimos/robot/unitree_webrtc/unitree_go2_blueprints.py @@ -27,6 +27,7 @@ from dimos.agents.cli.web import web_input from dimos.agents.ollama_agent import ollama_installed from dimos.agents.skills.navigation import navigation_skill +from dimos.agents.skills.person_follow import person_follow_skill from dimos.agents.skills.speak_skill import speak_skill from dimos.agents.spec import Provider from dimos.agents.vlm_agent import vlm_agent @@ -142,7 +143,13 @@ spatial = autoconnect( nav, - spatial_memory(), + # TODO: Turn back on. Turned off to stop logging. 
+ # EEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEE + # EEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEE + # EEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEE + # EEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEE + # EEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEE + # spatial_memory(), utilization(), ).global_config(n_dask_workers=8) @@ -168,6 +175,7 @@ _common_agentic = autoconnect( human_input(), navigation_skill(), + person_follow_skill(camera_info=GO2Connection.camera_info_static), unitree_skills(), web_input(), speak_skill(), diff --git a/dimos/simulation/mujoco/constants.py b/dimos/simulation/mujoco/constants.py index aca916a372..4e35011530 100644 --- a/dimos/simulation/mujoco/constants.py +++ b/dimos/simulation/mujoco/constants.py @@ -17,6 +17,7 @@ # Video/Camera constants VIDEO_WIDTH = 320 VIDEO_HEIGHT = 240 +VIDEO_CAMERA_FOV = 45 # MuJoCo default FOV for head_camera (degrees) DEPTH_CAMERA_FOV = 160 # Depth camera range/filtering constants diff --git a/dimos/simulation/mujoco/model.py b/dimos/simulation/mujoco/model.py index de533521da..bc309b7307 100644 --- a/dimos/simulation/mujoco/model.py +++ b/dimos/simulation/mujoco/model.py @@ -37,14 +37,21 @@ def _get_data_dir() -> epath.Path: def get_assets() -> dict[str, bytes]: data_dir = _get_data_dir() + assets: dict[str, bytes] = {} + # Assets used from https://sketchfab.com/3d-models/mersus-office-8714be387bcd406898b2615f7dae3a47 # Created by Ryan Cassidy and Coleman Costello - assets: dict[str, bytes] = {} mjx_env.update_assets(assets, data_dir, "*.xml") mjx_env.update_assets(assets, data_dir / "scene_office1/textures", "*.png") mjx_env.update_assets(assets, data_dir / "scene_office1/office_split", "*.obj") mjx_env.update_assets(assets, mjx_env.MENAGERIE_PATH / "unitree_go1" / "assets") mjx_env.update_assets(assets, mjx_env.MENAGERIE_PATH / "unitree_g1" / "assets") + + # From: 
def _add_person_object(root: ET.Element) -> None:
    """Register the person mesh assets and attach a mocap person body.

    Ensures ``<asset>`` and ``<worldbody>`` sections exist on *root*, declares
    the person mesh, its texture and material, then adds a mocap body named
    ``"person"`` carrying the textured mesh geom.
    """
    asset = root.find("asset")
    if asset is None:
        asset = ET.SubElement(root, "asset")

    ET.SubElement(asset, "mesh", name="person_mesh", file="jeong_seun_34.obj")
    ET.SubElement(asset, "texture", name="person_texture", file="material_0.png", type="2d")
    ET.SubElement(asset, "material", name="person_material", texture="person_texture")

    worldbody = root.find("worldbody")
    if worldbody is None:
        worldbody = ET.SubElement(root, "worldbody")

    # mocap="true" lets the simulation drive this body's pose directly.
    body = ET.SubElement(worldbody, "body", name="person", pos="0 0 0", mocap="true")

    # NOTE(review): the 90-degree rotation about X presumably stands the mesh
    # upright — confirm against the person asset's native orientation.
    ET.SubElement(
        body,
        "geom",
        type="mesh",
        mesh="person_mesh",
        material="person_material",
        euler="1.5708 0 0",
    )
-> None: camera_id = mujoco.mj_name2id(model, mujoco.mjtObj.mjOBJ_CAMERA, "head_camera") lidar_camera_id = mujoco.mj_name2id(model, mujoco.mjtObj.mjOBJ_CAMERA, "lidar_front_camera") + + person_position_controller = PersonPositionController(model) + lidar_left_camera_id = mujoco.mj_name2id(model, mujoco.mjtObj.mjOBJ_CAMERA, "lidar_left_camera") lidar_right_camera_id = mujoco.mj_name2id( model, mujoco.mjtObj.mjOBJ_CAMERA, "lidar_right_camera" @@ -138,6 +142,8 @@ def _run_simulation(config: GlobalConfig, shm: ShmReader) -> None: for _ in range(config.mujoco_steps_per_frame): mujoco.mj_step(model, data) + person_position_controller.tick(data) + m_viewer.sync() # Always update odometry @@ -220,6 +226,8 @@ def _run_simulation(config: GlobalConfig, shm: ShmReader) -> None: if time_until_next_step > 0: time.sleep(time_until_next_step) + person_position_controller.stop() + if __name__ == "__main__": diff --git a/dimos/simulation/mujoco/person_on_track.py b/dimos/simulation/mujoco/person_on_track.py new file mode 100644 index 0000000000..a816b5f3ee --- /dev/null +++ b/dimos/simulation/mujoco/person_on_track.py @@ -0,0 +1,160 @@ +# Copyright 2026 Dimensional Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
class PersonPositionController:
    """Drives the MuJoCo person mocap body from LCM pose updates.

    Subscribes to ``/person_pose`` and, on every simulation tick, copies the
    most recently received pose into the person body's mocap slots.
    """

    def __init__(self, model: mujoco.MjModel) -> None:
        body_id = mujoco.mj_name2id(model, mujoco.mjtObj.mjOBJ_BODY, "person")
        self._person_mocap_id = model.body_mocapid[body_id]
        self._latest_pose: Pose | None = None
        self._transport: LCMTransport[Pose] = LCMTransport("/person_pose", Pose)
        self._transport.subscribe(self._on_pose)

    def _on_pose(self, pose: Pose) -> None:
        # Invoked from the LCM transport; just remember the newest pose.
        self._latest_pose = pose

    def tick(self, data: mujoco.MjData) -> None:
        """Apply the latest received pose to the mocap body, if any arrived."""
        pose = self._latest_pose
        if pose is None:
            return

        mocap_id = self._person_mocap_id
        data.mocap_pos[mocap_id][0] = pose.position.x
        data.mocap_pos[mocap_id][1] = pose.position.y
        data.mocap_pos[mocap_id][2] = pose.position.z
        # MuJoCo quaternion order is (w, x, y, z).
        data.mocap_quat[mocap_id] = [
            pose.orientation.w,
            pose.orientation.x,
            pose.orientation.y,
            pose.orientation.z,
        ]

    def stop(self) -> None:
        """Shut down the LCM transport."""
        self._transport.stop()
class PersonTrackPublisher:
    """Publishes person poses along a closed track of waypoints via LCM.

    The person walks at constant speed between consecutive waypoints and
    smoothly rotates through each corner: the turn starts inside a
    ``_rotation_radius`` zone before a waypoint and finishes inside the same
    radius after it.
    """

    def __init__(self, track: list[tuple[float, float]]) -> None:
        self._speed = 0.004
        self._waypoint_threshold = 0.1
        self._rotation_radius = 1.0
        self._track = track
        self._current_waypoint_idx = 0
        self._initialized = False
        self._current_pos = np.array([0.0, 0.0])
        self._transport: LCMTransport[Pose] = LCMTransport("/person_pose", Pose)

    def _get_segment_heading(self, from_idx: int, to_idx: int) -> float:
        """Get heading angle for traveling from one waypoint to another."""
        from_wp = np.array(self._track[from_idx])
        to_wp = np.array(self._track[to_idx])
        direction = to_wp - from_wp
        return float(np.arctan2(direction[1], direction[0]))

    def _lerp_angle(self, a1: float, a2: float, t: float) -> float:
        """Interpolate from *a1* toward *a2* by fraction *t*, wrap-aware."""
        delta = a2 - a1
        # Bring the angular difference into [-pi, pi] before interpolating.
        while delta > np.pi:
            delta -= 2 * np.pi
        while delta < -np.pi:
            delta += 2 * np.pi
        return a1 + delta * t

    def tick(self) -> None:
        """Advance the person one step along the track and publish its pose."""
        if not self._initialized:
            # First call: place the person on the first waypoint, facing the
            # second one.
            start = self._track[0]
            self._current_pos = np.array([start[0], start[1]])
            self._current_waypoint_idx = 1
            self._publish_pose(self._current_pos, self._get_segment_heading(0, 1))
            self._initialized = True
            return

        n = len(self._track)

        curr_idx = self._current_waypoint_idx
        prev_idx = (curr_idx - 1) % n
        next_idx = (curr_idx + 1) % n
        prev_prev_idx = (prev_idx - 1) % n

        prev_wp = np.array(self._track[prev_idx])
        curr_wp = np.array(self._track[curr_idx])

        to_target = curr_wp - self._current_pos
        distance_to_curr = float(np.linalg.norm(to_target))
        distance_from_prev = float(np.linalg.norm(self._current_pos - prev_wp))

        # Headings for the turn at curr_wp ...
        incoming_heading = self._get_segment_heading(prev_idx, curr_idx)
        outgoing_heading = self._get_segment_heading(curr_idx, next_idx)
        # ... and for the (possibly still finishing) turn at prev_wp.
        prev_incoming_heading = self._get_segment_heading(prev_prev_idx, prev_idx)
        prev_outgoing_heading = incoming_heading

        in_leaving_zone = distance_from_prev < self._rotation_radius
        in_approaching_zone = distance_to_curr < self._rotation_radius

        # Approaching zone wins when both zones overlap, matching the original
        # priority; the two original branches computed the same value.
        if in_approaching_zone:
            # Starting the turn before curr_wp: t runs 0.0 -> 0.5.
            t = 0.5 * (1.0 - distance_to_curr / self._rotation_radius)
            heading = self._lerp_angle(incoming_heading, outgoing_heading, t)
        elif in_leaving_zone:
            # Finishing the previous turn after prev_wp: t runs 0.5 -> 1.0.
            t = 0.5 + 0.5 * (distance_from_prev / self._rotation_radius)
            heading = self._lerp_angle(prev_incoming_heading, prev_outgoing_heading, t)
        else:
            # Between zones: travel straight along the segment.
            heading = incoming_heading

        # Step toward the current waypoint at constant speed.
        if distance_to_curr > 0:
            step = (to_target / distance_to_curr) * self._speed
            self._current_pos[0] += step[0]
            self._current_pos[1] += step[1]

        # Advance to the next waypoint once close enough.
        if distance_to_curr < self._waypoint_threshold:
            self._current_waypoint_idx = next_idx

        # NOTE(review): the pi offset presumably compensates the person mesh's
        # forward axis — confirm against the model asset.
        self._publish_pose(self._current_pos, heading + np.pi)

    def _publish_pose(self, pos: NDArray[np.floating[Any]], heading: float) -> None:
        """Broadcast *pos* with yaw *heading* as a Pose on the LCM transport."""
        half = heading / 2
        c, s = np.cos(half), np.sin(half)
        pose = Pose(
            position=[pos[0], pos[1], 0.0],
            orientation=[0.0, 0.0, s, c],  # x, y, z, w
        )
        self._transport.broadcast(None, pose)

    def stop(self) -> None:
        """Shut down the LCM transport."""
        self._transport.stop()
"https://files.pythonhosted.org/packages/3e/38/7859ff46355f76f8d19459005ca000b6e7012f2f1ca597746cbcd1fbfe5e/antlr4-python3-runtime-4.9.3.tar.gz", hash = "sha256:f224469b4168294902bb1efa80a8bf7855f24c99aef99cbefc1bcd3cce77881b", size = 117034, upload-time = "2021-11-06T17:52:23.524Z" } + [[package]] name = "anyio" version = "4.12.1" @@ -1520,12 +1526,15 @@ manipulation = [ misc = [ { name = "catkin-pkg" }, { name = "cerebras-cloud-sdk" }, + { name = "edgetam-dimos" }, { name = "einops" }, { name = "empy" }, { name = "gdown" }, { name = "googlemaps" }, + { name = "hydra-core" }, { name = "ipykernel" }, { name = "lark" }, + { name = "omegaconf" }, { name = "onnx" }, { name = "open-clip-torch" }, { name = "opencv-contrib-python" }, @@ -1615,6 +1624,7 @@ requires-dist = [ { name = "dimos", extras = ["agents", "web", "perception", "visualization", "sim"], marker = "extra == 'base'" }, { name = "dimos", extras = ["base"], marker = "extra == 'unitree'" }, { name = "dimos-lcm" }, + { name = "edgetam-dimos", marker = "extra == 'misc'" }, { name = "einops", marker = "extra == 'misc'", specifier = "==0.8.1" }, { name = "empy", marker = "extra == 'misc'", specifier = "==3.3.4" }, { name = "fastapi", marker = "extra == 'web'", specifier = ">=0.115.6" }, @@ -1623,6 +1633,7 @@ requires-dist = [ { name = "gdown", marker = "extra == 'misc'", specifier = "==5.2.0" }, { name = "googlemaps", marker = "extra == 'misc'", specifier = ">=4.10.0" }, { name = "h5py", marker = "extra == 'manipulation'", specifier = ">=3.7.0" }, + { name = "hydra-core", marker = "extra == 'misc'", specifier = ">=1.3.0" }, { name = "ipykernel", marker = "extra == 'misc'" }, { name = "kaleido", marker = "extra == 'manipulation'", specifier = ">=0.2.1" }, { name = "langchain", marker = "extra == 'agents'", specifier = ">=1,<2" }, @@ -1647,6 +1658,7 @@ requires-dist = [ { name = "numpy", specifier = ">=1.26.4" }, { name = "nvidia-nvimgcodec-cu12", extras = ["all"], marker = "extra == 'cuda'" }, { name = "ollama", 
marker = "extra == 'agents'", specifier = ">=0.6.0" }, + { name = "omegaconf", marker = "extra == 'misc'", specifier = ">=2.3.0" }, { name = "onnx", marker = "extra == 'misc'" }, { name = "onnxruntime", marker = "extra == 'cpu'" }, { name = "onnxruntime-gpu", marker = "extra == 'cuda'", specifier = ">=1.17.1" }, @@ -1829,6 +1841,22 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/b0/0d/9feae160378a3553fa9a339b0e9c1a048e147a4127210e286ef18b730f03/durationpy-0.10-py3-none-any.whl", hash = "sha256:3b41e1b601234296b4fb368338fdcd3e13e0b4fb5b67345948f4f2bf9868b286", size = 3922, upload-time = "2025-05-17T13:52:36.463Z" }, ] +[[package]] +name = "edgetam-dimos" +version = "1.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "hydra-core" }, + { name = "iopath" }, + { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" }, + { name = "numpy", version = "2.3.5", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" }, + { name = "pillow" }, + { name = "torch" }, + { name = "torchvision" }, + { name = "tqdm" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/7a/ea/bec55e18e19b6e43ed5f18bfcb699933ab82744fa8a52209ac6e94a6d6d8/edgetam_dimos-1.0.tar.gz", hash = "sha256:4fea5fd5a5aa17f9145dc4f35abc41de9426acaa0d59cae9b467cf26e657d4a7", size = 74935, upload-time = "2026-01-19T22:53:39.159Z" } + [[package]] name = "einops" version = "0.8.1" @@ -2689,6 +2717,20 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/c5/7b/bca5613a0c3b542420cf92bd5e5fb8ebd5435ce1011a091f66bb7693285e/humanize-4.15.0-py3-none-any.whl", hash = "sha256:b1186eb9f5a9749cd9cb8565aee77919dd7c8d076161cf44d70e59e3301e1769", size = 132203, upload-time = "2025-12-20T20:16:11.67Z" }, ] +[[package]] +name = "hydra-core" +version = "1.3.2" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = 
"antlr4-python3-runtime" }, + { name = "omegaconf" }, + { name = "packaging" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/6d/8e/07e42bc434a847154083b315779b0a81d567154504624e181caf2c71cd98/hydra-core-1.3.2.tar.gz", hash = "sha256:8a878ed67216997c3e9d88a8e72e7b4767e81af37afb4ea3334b269a4390a824", size = 3263494, upload-time = "2023-02-23T18:33:43.03Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/c6/50/e0edd38dcd63fb26a8547f13d28f7a008bc4a3fd4eb4ff030673f22ad41a/hydra_core-1.3.2-py3-none-any.whl", hash = "sha256:fa0238a9e31df3373b35b0bfb672c34cc92718d21f81311d8996a16de1141d8b", size = 154547, upload-time = "2023-02-23T18:33:40.801Z" }, +] + [[package]] name = "identify" version = "2.6.15" @@ -2760,6 +2802,17 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/cb/b1/3846dd7f199d53cb17f49cba7e651e9ce294d8497c8c150530ed11865bb8/iniconfig-2.3.0-py3-none-any.whl", hash = "sha256:f631c04d2c48c52b84d0d0549c99ff3859c98df65b3101406327ecc7d53fbf12", size = 7484, upload-time = "2025-10-18T21:55:41.639Z" }, ] +[[package]] +name = "iopath" +version = "0.1.10" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "portalocker" }, + { name = "tqdm" }, + { name = "typing-extensions" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/72/73/b3d451dfc523756cf177d3ebb0af76dc7751b341c60e2a21871be400ae29/iopath-0.1.10.tar.gz", hash = "sha256:3311c16a4d9137223e20f141655759933e1eda24f8bff166af834af3c645ef01", size = 42226, upload-time = "2022-07-09T19:00:50.866Z" } + [[package]] name = "ipykernel" version = "7.1.0" @@ -5210,6 +5263,19 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/47/4f/4a617ee93d8208d2bcf26b2d8b9402ceaed03e3853c754940e2290fed063/ollama-0.6.1-py3-none-any.whl", hash = "sha256:fc4c984b345735c5486faeee67d8a265214a31cbb828167782dc642ce0a2bf8c", size = 14354, upload-time = "2025-11-13T23:02:16.292Z" }, ] +[[package]] +name = "omegaconf" +version = "2.3.0" +source = { 
registry = "https://pypi.org/simple" } +dependencies = [ + { name = "antlr4-python3-runtime" }, + { name = "pyyaml" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/09/48/6388f1bb9da707110532cb70ec4d2822858ddfb44f1cdf1233c20a80ea4b/omegaconf-2.3.0.tar.gz", hash = "sha256:d5d4b6d29955cc50ad50c46dc269bcd92c6e00f5f90d23ab5fee7bfca4ba4cc7", size = 3298120, upload-time = "2022-12-08T20:59:22.753Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/e3/94/1843518e420fa3ed6919835845df698c7e27e183cb997394e4a670973a65/omegaconf-2.3.0-py3-none-any.whl", hash = "sha256:7b4df175cdb08ba400f45cae3bdcae7ba8365db4d165fc65fd04b050ab63b46b", size = 79500, upload-time = "2022-12-08T20:59:19.686Z" }, +] + [[package]] name = "onnx" version = "1.20.0" @@ -6023,6 +6089,18 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/d1/d8/a12e6aa14f63784cead437083319ec7cece0d5bb9a5bfe7678cc6578b52a/polars_runtime_32-1.36.1-cp39-abi3-win_arm64.whl", hash = "sha256:809e73857be71250141225ddd5d2b30c97e6340aeaa0d445f930e01bef6888dc", size = 39798896, upload-time = "2025-12-10T01:14:11.568Z" }, ] +[[package]] +name = "portalocker" +version = "3.2.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "pywin32", marker = "sys_platform == 'win32'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/5e/77/65b857a69ed876e1951e88aaba60f5ce6120c33703f7cb61a3c894b8c1b6/portalocker-3.2.0.tar.gz", hash = "sha256:1f3002956a54a8c3730586c5c77bf18fae4149e07eaf1c29fc3faf4d5a3f89ac", size = 95644, upload-time = "2025-06-14T13:20:40.03Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/4b/a6/38c8e2f318bf67d338f4d629e93b0b4b9af331f455f0390ea8ce4a099b26/portalocker-3.2.0-py3-none-any.whl", hash = "sha256:3cdc5f565312224bc570c49337bd21428bba0ef363bbcf58b9ef4a9f11779968", size = 22424, upload-time = "2025-06-14T13:20:38.083Z" }, +] + [[package]] name = "posthog" version = "5.4.0"