diff --git a/assets/agent/prompt.txt b/assets/agent/prompt.txt
index 8a2ccff020..b3d159a7df 100644
--- a/assets/agent/prompt.txt
+++ b/assets/agent/prompt.txt
@@ -64,7 +64,13 @@ Saved Robot Locations:
 
 ***ALWAYS CHECK FIRST if you can find a navigation query in the Saved Robot Locations before running the NavigateWithText tool call. If a saved location is found, get there with NavigateToGoal.***
 
-***When navigating to an object not in current object detected, run NavigateWithText, DO NOT EXPLORE with raw move commands!!!
+***Don't use object detections for navigating to an object, ALWAYS run NavigateWithText. Only use object detections if NavigateWithText fails***
+
+***When running NavigateWithText, set skip_visual_search flag to TRUE if the query is a general location such as kitchen or office, if it fails, then run without this flag***
+
+***When navigating to an object not in current object detected, run NavigateWithText, DO NOT EXPLORE with raw move commands!!!***
+
+***The object detection list is not a comprehensive source of information, when given a visual query like "go to the person wearing a hat" or "Do you see a dog", always Prioritize running observe skill and NavigateWithText***
 
 PLANNING & REASONING:
 - You can develop both short-term and long-term plans to achieve complex goals
diff --git a/dimos/agents/claude_agent.py b/dimos/agents/claude_agent.py
index 53148d17b3..e87b1f47b4 100644
--- a/dimos/agents/claude_agent.py
+++ b/dimos/agents/claude_agent.py
@@ -624,8 +624,11 @@ def _observable_query(
             observer.on_completed()
         except Exception as e:
             logger.error(f"Query failed in {self.dev_name}: {e}")
-            observer.on_error(e)
-            self.response_subject.on_error(e)
+            # Send a user-friendly error message instead of propagating the error
+            error_message = "I apologize, but I'm having trouble processing your request right now. Please try again."
+            observer.on_next(error_message)
+            self.response_subject.on_next(error_message)
+            observer.on_completed()
 
     def _handle_tooling(self, response_message, messages):
         """Executes tools and appends tool-use/result blocks to messages."""
@@ -649,12 +652,36 @@ def _handle_tooling(self, response_message, messages):
             }
             messages.append({"role": "assistant", "content": [tool_use_block]})
 
-            # Execute the tool
-            args = json.loads(tool_call.function.arguments)
-            tool_result = self.skills.call(tool_call.function.name, **args)
-
-            # Add tool result to conversation history
-            if tool_result:
+            try:
+                # Execute the tool
+                args = json.loads(tool_call.function.arguments)
+                tool_result = self.skills.call(tool_call.function.name, **args)
+
+                # Check if the result is an error message
+                if isinstance(tool_result, str) and (
+                    "Error executing skill" in tool_result or "is not available" in tool_result
+                ):
+                    # Log the error but provide a user-friendly message
+                    logger.error(f"Tool execution failed: {tool_result}")
+                    tool_result = "I apologize, but I'm having trouble executing that action right now. Please try again or ask for something else."
+
+                # Add tool result to conversation history
+                if tool_result:
+                    messages.append(
+                        {
+                            "role": "user",
+                            "content": [
+                                {
+                                    "type": "tool_result",
+                                    "tool_use_id": tool_call.id,
+                                    "content": f"{tool_result}",
+                                }
+                            ],
+                        }
+                    )
+            except Exception as e:
+                logger.error(f"Unexpected error executing tool {tool_call.function.name}: {e}")
+                # Add error result to conversation history
                 messages.append(
                     {
                         "role": "user",
@@ -662,7 +689,7 @@ def _handle_tooling(self, response_message, messages):
                             {
                                 "type": "tool_result",
                                 "tool_use_id": tool_call.id,
-                                "content": f"{tool_result}",
+                                "content": "I apologize, but I encountered an error while trying to execute that action. Please try again.",
                             }
                         ],
                     }
@@ -672,13 +699,19 @@ def _tooling_callback(self, response_message):
         """Runs the observable query for each tool call in the current response_message"""
         if not hasattr(response_message, "tool_calls") or not response_message.tool_calls:
             return
-        for tool_call in response_message.tool_calls:
-            tool_name = tool_call.function.name
-            tool_id = tool_call.id
-            self.run_observable_query(
-                query_text=f"Tool {tool_name}, ID: {tool_id} execution complete. Please summarize the results and continue.",
-                thinking_budget_tokens=0,
-            ).run()
+
+        try:
+            for tool_call in response_message.tool_calls:
+                tool_name = tool_call.function.name
+                tool_id = tool_call.id
+                self.run_observable_query(
+                    query_text=f"Tool {tool_name}, ID: {tool_id} execution complete. Please summarize the results and continue.",
+                    thinking_budget_tokens=0,
+                ).run()
+        except Exception as e:
+            logger.error(f"Error in tooling callback: {e}")
+            # Continue processing even if the callback fails
+            pass
 
     def _debug_api_call(self, claude_params: dict):
         """Debugging function to log API calls with truncated base64 data."""
diff --git a/dimos/models/qwen/video_query.py b/dimos/models/qwen/video_query.py
index 3471e52268..7eda5f1aed 100644
--- a/dimos/models/qwen/video_query.py
+++ b/dimos/models/qwen/video_query.py
@@ -128,7 +128,7 @@ def query_single_frame(
         openai_client=qwen_client,
         model_name=model_name,
         tokenizer=HuggingFaceTokenizer(model_name=f"Qwen/{model_name}"),
-        max_output_tokens_per_request=100,
+        max_output_tokens_per_request=8192,
         system_query=query,
         pool_scheduler=get_scheduler(),
     )
diff --git a/dimos/perception/object_detection_stream.py b/dimos/perception/object_detection_stream.py
index 552a31acfe..3284d99f8b 100644
--- a/dimos/perception/object_detection_stream.py
+++ b/dimos/perception/object_detection_stream.py
@@ -205,7 +205,7 @@ def process_frame(frame):
                             # position and rotation are already Vector objects, no need to convert
                             robot_pose = self.get_pose()
                             position, rotation = transform_robot_to_map(
-                                robot_pose, position, rotation
+                                robot_pose["position"], robot_pose["rotation"], position, rotation
                             )
                     except Exception as e:
                         logger.error(f"Error transforming to map frame: {e}")
diff --git a/dimos/robot/frontier_exploration/wavefront_frontier_goal_selector.py b/dimos/robot/frontier_exploration/wavefront_frontier_goal_selector.py
index 5f9032aa28..76f2ddbb0a 100644
--- a/dimos/robot/frontier_exploration/wavefront_frontier_goal_selector.py
+++ b/dimos/robot/frontier_exploration/wavefront_frontier_goal_selector.py
@@ -83,7 +83,7 @@ def __init__(
         self,
         min_frontier_size: int = 10,
         occupancy_threshold: int = 65,
-        subsample_resolution: int = 3,
+        subsample_resolution: int = 2,
         min_distance_from_robot: float = 0.5,
         explored_area_buffer: float = 0.5,
         min_distance_from_obstacles: float = 0.6,
diff --git a/dimos/robot/unitree/unitree_skills.py b/dimos/robot/unitree/unitree_skills.py
index d191753294..5029123ed1 100644
--- a/dimos/robot/unitree/unitree_skills.py
+++ b/dimos/robot/unitree/unitree_skills.py
@@ -51,34 +51,34 @@
         1006,
         "Recovers the robot to a state from which it can take more commands. Useful to run after multiple dynamic commands like front flips.",
     ),
-    (
-        "Euler",
-        1007,
-        "Adjusts the robot's orientation using Euler angles, providing precise control over its rotation.",
-    ),
+    # (
+    #     "Euler",
+    #     1007,
+    #     "Adjusts the robot's orientation using Euler angles, providing precise control over its rotation.",
+    # ),
     # ("Move", 1008, "Move the robot using velocity commands."),  # Intentionally omitted
     ("Sit", 1009, "Commands the robot to sit down from a standing or moving stance."),
-    (
-        "RiseSit",
-        1010,
-        "Commands the robot to rise back to a standing position from a sitting posture.",
-    ),
-    (
-        "SwitchGait",
-        1011,
-        "Switches the robot's walking pattern or style dynamically, suitable for different terrains or speeds.",
-    ),
-    ("Trigger", 1012, "Triggers a specific action or custom routine programmed into the robot."),
-    (
-        "BodyHeight",
-        1013,
-        "Adjusts the height of the robot's body from the ground, useful for navigating various obstacles.",
-    ),
-    (
-        "FootRaiseHeight",
-        1014,
-        "Controls how high the robot lifts its feet during movement, which can be adjusted for different surfaces.",
-    ),
+    # (
+    #     "RiseSit",
+    #     1010,
+    #     "Commands the robot to rise back to a standing position from a sitting posture.",
+    # ),
+    # (
+    #     "SwitchGait",
+    #     1011,
+    #     "Switches the robot's walking pattern or style dynamically, suitable for different terrains or speeds.",
+    # ),
+    # ("Trigger", 1012, "Triggers a specific action or custom routine programmed into the robot."),
+    # (
+    #     "BodyHeight",
+    #     1013,
+    #     "Adjusts the height of the robot's body from the ground, useful for navigating various obstacles.",
+    # ),
+    # (
+    #     "FootRaiseHeight",
+    #     1014,
+    #     "Controls how high the robot lifts its feet during movement, which can be adjusted for different surfaces.",
+    # ),
     (
         "SpeedLevel",
         1015,
@@ -90,16 +90,16 @@
         "Performs a greeting action, which could involve a wave or other friendly gesture.",
     ),
     ("Stretch", 1017, "Engages the robot in a stretching routine."),
-    (
-        "TrajectoryFollow",
-        1018,
-        "Directs the robot to follow a predefined trajectory, which could involve complex paths or maneuvers.",
-    ),
-    (
-        "ContinuousGait",
-        1019,
-        "Enables a mode for continuous walking or running, ideal for long-distance travel.",
-    ),
+    # (
+    #     "TrajectoryFollow",
+    #     1018,
+    #     "Directs the robot to follow a predefined trajectory, which could involve complex paths or maneuvers.",
+    # ),
+    # (
+    #     "ContinuousGait",
+    #     1019,
+    #     "Enables a mode for continuous walking or running, ideal for long-distance travel.",
+    # ),
     ("Content", 1020, "To display or trigger when the robot is happy."),
     ("Wallow", 1021, "The robot falls onto its back and rolls around."),
     (
@@ -108,18 +108,18 @@
         "Performs a predefined dance routine 1, programmed for entertainment or demonstration.",
     ),
     ("Dance2", 1023, "Performs another variant of a predefined dance routine 2."),
-    ("GetBodyHeight", 1024, "Retrieves the current height of the robot's body from the ground."),
-    (
-        "GetFootRaiseHeight",
-        1025,
-        "Retrieves the current height at which the robot's feet are being raised during movement.",
-    ),
-    ("GetSpeedLevel", 1026, "Returns the current speed level at which the robot is operating."),
-    (
-        "SwitchJoystick",
-        1027,
-        "Toggles the control mode to joystick input, allowing for manual direction of the robot's movements.",
-    ),
+    # ("GetBodyHeight", 1024, "Retrieves the current height of the robot's body from the ground."),
+    # (
+    #     "GetFootRaiseHeight",
+    #     1025,
+    #     "Retrieves the current height at which the robot's feet are being raised during movement.",
+    # ),
+    # ("GetSpeedLevel", 1026, "Returns the current speed level at which the robot is operating."),
+    # (
+    #     "SwitchJoystick",
+    #     1027,
+    #     "Toggles the control mode to joystick input, allowing for manual direction of the robot's movements.",
+    # ),
     (
         "Pose",
         1028,
@@ -137,46 +137,46 @@
         1032,
         "Initiates a pouncing movement forward, mimicking animal-like pouncing behavior.",
     ),
-    ("WiggleHips", 1033, "Causes the robot to wiggle its hips."),
-    (
-        "GetState",
-        1034,
-        "Retrieves the current operational state of the robot, including status reports or diagnostic information.",
-    ),
-    (
-        "EconomicGait",
-        1035,
-        "Engages a more energy-efficient walking or running mode to conserve battery life.",
-    ),
-    ("FingerHeart", 1036, "Performs a finger heart gesture while on its hind legs."),
-    (
-        "Handstand",
-        1301,
-        "Commands the robot to perform a handstand, demonstrating balance and control.",
-    ),
-    (
-        "CrossStep",
-        1302,
-        "Engages the robot in a cross-stepping routine, useful for complex locomotion or dance moves.",
-    ),
-    (
-        "OnesidedStep",
-        1303,
-        "Commands the robot to perform a stepping motion that predominantly uses one side.",
-    ),
-    (
-        "Bound",
-        1304,
-        "Initiates a bounding motion, similar to a light, repetitive hopping or leaping.",
-    ),
-    (
-        "LeadFollow",
-        1045,
-        "Engages follow-the-leader behavior, where the robot follows a designated leader or follows a signal.",
-    ),
-    ("LeftFlip", 1042, "Executes a flip towards the left side."),
-    ("RightFlip", 1043, "Performs a flip towards the right side."),
-    ("Backflip", 1044, "Executes a backflip, a complex and dynamic maneuver."),
+    # ("WiggleHips", 1033, "Causes the robot to wiggle its hips."),
+    # (
+    #     "GetState",
+    #     1034,
+    #     "Retrieves the current operational state of the robot, including status reports or diagnostic information.",
+    # ),
+    # (
+    #     "EconomicGait",
+    #     1035,
+    #     "Engages a more energy-efficient walking or running mode to conserve battery life.",
+    # ),
+    # ("FingerHeart", 1036, "Performs a finger heart gesture while on its hind legs."),
+    # (
+    #     "Handstand",
+    #     1301,
+    #     "Commands the robot to perform a handstand, demonstrating balance and control.",
+    # ),
+    # (
+    #     "CrossStep",
+    #     1302,
+    #     "Engages the robot in a cross-stepping routine, useful for complex locomotion or dance moves.",
+    # ),
+    # (
+    #     "OnesidedStep",
+    #     1303,
+    #     "Commands the robot to perform a stepping motion that predominantly uses one side.",
+    # ),
+    # (
+    #     "Bound",
+    #     1304,
+    #     "Initiates a bounding motion, similar to a light, repetitive hopping or leaping.",
+    # ),
+    # (
+    #     "LeadFollow",
+    #     1045,
+    #     "Engages follow-the-leader behavior, where the robot follows a designated leader or follows a signal.",
+    # ),
+    # ("LeftFlip", 1042, "Executes a flip towards the left side."),
+    # ("RightFlip", 1043, "Performs a flip towards the right side."),
+    # ("Backflip", 1044, "Executes a backflip, a complex and dynamic maneuver."),
 ]
 
 # region MyUnitreeSkills
diff --git a/dimos/robot/unitree_webrtc/unitree_skills.py b/dimos/robot/unitree_webrtc/unitree_skills.py
index e6bc4ec759..f9dfc1eede 100644
--- a/dimos/robot/unitree_webrtc/unitree_skills.py
+++ b/dimos/robot/unitree_webrtc/unitree_skills.py
@@ -50,14 +50,8 @@
     (
         "RecoveryStand",
         1006,
-        "Recovers the robot to a state from which it can take more commands. Useful to run after multiple dynamic commands like front flips.",
+        "Recovers the robot to a state from which it can take more commands. Useful to run after multiple dynamic commands like front flips, Must run after skills like sit and jump and standup.",
     ),
-    (
-        "Euler",
-        1007,
-        "Adjusts the robot's orientation using Euler angles, providing precise control over its rotation.",
-    ),
-    # ("Move", 1008, "Move the robot using velocity commands."),  # Handled separately
     ("Sit", 1009, "Commands the robot to sit down from a standing or moving stance."),
     (
         "RiseSit",
diff --git a/dimos/skills/navigation.py b/dimos/skills/navigation.py
index 114e3dacb8..6d67ae04f2 100644
--- a/dimos/skills/navigation.py
+++ b/dimos/skills/navigation.py
@@ -60,18 +60,24 @@ class NavigateWithText(AbstractRobotSkill):
 
     This skill first attempts to locate an object in the robot's camera view using vision.
     If the object is found, it navigates to it. If not, it falls back to querying the
-    semantic map for a location matching the description. For example, "Find the kitchen"
-    will first look for a kitchen in view, then check the semantic map coordinates where
-    a kitchen was previously observed.
+    semantic map for a location matching the description. For example, "Find the Teddy Bear"
+    will first look for a Teddy Bear in view, then check the semantic map coordinates where
+    a Teddy Bear was previously observed.
 
     CALL THIS SKILL FOR ONE SUBJECT AT A TIME. For example: "Go to the person wearing a blue shirt in the living room",
     you should call this skill twice, once for the person wearing a blue shirt and once for the living room.
+
+    If skip_visual_search is True, this skill will skip the visual search for the object in view.
+    This is useful if you want to navigate to a general location such as a kitchen or office.
+    For example, "Go to the kitchen" will not look for a kitchen in view, but will check the semantic map coordinates where
+    a kitchen was previously observed.
     """
 
     query: str = Field("", description="Text query to search for in the semantic map")
 
     limit: int = Field(1, description="Maximum number of results to return")
     distance: float = Field(1.0, description="Desired distance to maintain from object in meters")
+    skip_visual_search: bool = Field(False, description="Skip visual search for object in view")
     timeout: float = Field(40.0, description="Maximum time to spend navigating in seconds")
 
     def __init__(self, robot=None, **data):
@@ -362,22 +368,24 @@ def __call__(self):
 
         # First, try to find and navigate to the object in camera view
         logger.info(f"First attempting to find and navigate to visible object: '{self.query}'")
-        object_result = self._navigate_to_object()
 
-        if object_result and object_result["success"]:
-            logger.info(f"Successfully navigated to {self.query} in view")
-            return object_result
+        if not self.skip_visual_search:
+            object_result = self._navigate_to_object()
 
-        elif object_result and object_result["failure_reason"] == "Navigation":
+            if object_result and object_result["success"]:
+                logger.info(f"Successfully navigated to {self.query} in view")
+                return object_result
+
+            elif object_result and object_result["failure_reason"] == "Navigation":
+                logger.info(
+                    f"Failed to navigate to {self.query} in view: {object_result.get('error', 'Unknown error')}"
+                )
+                return object_result
+
+            # If object navigation failed, fall back to semantic map
             logger.info(
-                f"Failed to navigate to {self.query} in view: {object_result.get('error', 'Unknown error')}"
+                f"Object not found in view. Falling back to semantic map query for: '{self.query}'"
             )
-            return object_result
-
-        # If object navigation failed, fall back to semantic map
-        logger.info(
-            f"Object not found in view. Falling back to semantic map query for: '{self.query}'"
-        )
 
         return self._navigate_using_semantic_map()
 
@@ -405,12 +413,6 @@ def stop(self):
                 logger.error(f"Error disposing navigation task: {e}")
             self._navigation_disposable = None
 
-        # Clean up spatial memory if it exists
-        if hasattr(self, "_spatial_memory") and self._spatial_memory is not None:
-            logger.info("Cleaning up spatial memory")
-            self._spatial_memory.cleanup()
-            self._spatial_memory = None
-
         return "Navigate skill stopped successfully."
 
 
diff --git a/dimos/skills/observe.py b/dimos/skills/observe.py
index 844df11805..067307353a 100644
--- a/dimos/skills/observe.py
+++ b/dimos/skills/observe.py
@@ -106,14 +106,8 @@ def __call__(self):
             # Process the frame with Qwen
             response = self._process_frame_with_qwen(frame)
 
-            # Add the response to the conversation history
-            # self._agent.append_to_history(
-            #     f"Observation: {response}",
-            # )
-            response = self._agent.run_observable_query(f"Observation: {response}")
-
             logger.info(f"Added Qwen observation to conversation history")
-            return f"Observation complete: {response[:100]}..."
+            return f"Observation complete: {response}"
 
         except Exception as e:
             error_msg = f"Error in Observe skill: {e}"
@@ -127,6 +121,10 @@ def _get_frame_from_stream(self):
         Returns:
             A single frame from the video stream, or None if no frame is available
         """
+        if self._video_stream is None:
+            logger.error("Video stream is None")
+            return None
+
         frame = None
         frame_subject = rx.subject.Subject()
 
diff --git a/dimos/skills/skills.py b/dimos/skills/skills.py
index 559f9b0e36..f6c7456d24 100644
--- a/dimos/skills/skills.py
+++ b/dimos/skills/skills.py
@@ -121,28 +121,35 @@ def create_instance(self, name, **kwargs):
             print(f"Stored args for later instance creation: {name} with args: {kwargs}")
 
     def call(self, name, **args):
-        # Get the stored args if available; otherwise, use an empty dict
-        stored_args = self._instances.get(name, {})
-
-        # Merge the arguments with priority given to stored arguments
-        complete_args = {**args, **stored_args}
-
-        # Dynamically get the class from the module or current script
-        skill_class = getattr(self, name, None)
-        if skill_class is None:
-            for skill in self.get():
-                if name == skill.__name__:
-                    skill_class = skill
-                    break
-            if skill_class is None:
-                raise ValueError(f"Skill class not found: {name}")
+        try:
+            # Get the stored args if available; otherwise, use an empty dict
+            stored_args = self._instances.get(name, {})
 
-        # Initialize the instance with the merged arguments
-        instance = skill_class(**complete_args)
-        print(f"Instance created and function called for: {name} with args: {complete_args}")
+            # Merge the arguments with priority given to stored arguments
+            complete_args = {**args, **stored_args}
 
-        # Call the instance directly
-        return instance()
+            # Dynamically get the class from the module or current script
+            skill_class = getattr(self, name, None)
+            if skill_class is None:
+                for skill in self.get():
+                    if name == skill.__name__:
+                        skill_class = skill
+                        break
+                if skill_class is None:
+                    error_msg = f"Skill '{name}' is not available. Please check if it's properly registered."
+                    logger.error(f"Skill class not found: {name}")
+                    return error_msg
+
+            # Initialize the instance with the merged arguments
+            instance = skill_class(**complete_args)
+            print(f"Instance created and function called for: {name} with args: {complete_args}")
+
+            # Call the instance directly
+            return instance()
+        except Exception as e:
+            error_msg = f"Error executing skill '{name}': {str(e)}"
+            logger.error(error_msg)
+            return error_msg
 
     # ==== Tools ====
 
diff --git a/dimos/skills/unitree/__init__.py b/dimos/skills/unitree/__init__.py
new file mode 100644
index 0000000000..8b13789179
--- /dev/null
+++ b/dimos/skills/unitree/__init__.py
@@ -0,0 +1 @@
+
diff --git a/dimos/skills/unitree/unitree_speak.py b/dimos/skills/unitree/unitree_speak.py
new file mode 100644
index 0000000000..05004398f9
--- /dev/null
+++ b/dimos/skills/unitree/unitree_speak.py
@@ -0,0 +1,280 @@
+# Copyright 2025 Dimensional Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from dimos.skills.skills import AbstractRobotSkill
+from pydantic import Field
+import time
+import tempfile
+import os
+import json
+import base64
+import hashlib
+import soundfile as sf
+import numpy as np
+from openai import OpenAI
+from dimos.utils.logging_config import setup_logger
+from go2_webrtc_driver.constants import RTC_TOPIC
+
+logger = setup_logger("dimos.skills.unitree.unitree_speak")
+
+# Audio API constants (from go2_webrtc_driver)
+AUDIO_API = {
+    "GET_AUDIO_LIST": 1001,
+    "SELECT_START_PLAY": 1002,
+    "PAUSE": 1003,
+    "UNSUSPEND": 1004,
+    "SET_PLAY_MODE": 1007,
+    "UPLOAD_AUDIO_FILE": 2001,
+    "ENTER_MEGAPHONE": 4001,
+    "EXIT_MEGAPHONE": 4002,
+    "UPLOAD_MEGAPHONE": 4003,
+}
+
+PLAY_MODES = {"NO_CYCLE": "no_cycle", "SINGLE_CYCLE": "single_cycle", "LIST_LOOP": "list_loop"}
+
+
+class UnitreeSpeak(AbstractRobotSkill):
+    """Speak text out loud through the robot's speakers using WebRTC audio upload."""
+
+    text: str = Field(..., description="Text to speak")
+    voice: str = Field(
+        default="echo", description="Voice to use (alloy, echo, fable, onyx, nova, shimmer)"
+    )
+    speed: float = Field(default=1.2, description="Speech speed (0.25 to 4.0)")
+    use_megaphone: bool = Field(
+        default=False, description="Use megaphone mode for lower latency (experimental)"
+    )
+
+    def __init__(self, **data):
+        super().__init__(**data)
+        self._openai_client = None
+
+    def _get_openai_client(self):
+        if self._openai_client is None:
+            self._openai_client = OpenAI()
+        return self._openai_client
+
+    def _generate_audio(self, text: str) -> bytes:
+        try:
+            client = self._get_openai_client()
+            response = client.audio.speech.create(
+                model="tts-1", voice=self.voice, input=text, speed=self.speed, response_format="mp3"
+            )
+            return response.content
+        except Exception as e:
+            logger.error(f"Error generating audio: {e}")
+            raise
+
+    def _webrtc_request(self, api_id: int, parameter: dict = None):
+        if parameter is None:
+            parameter = {}
+
+        request_data = {"api_id": api_id, "parameter": json.dumps(parameter) if parameter else "{}"}
+
+        return self._robot.webrtc_connection.publish_request(
+            RTC_TOPIC["AUDIO_HUB_REQ"], request_data
+        )
+
+    def _upload_audio_to_robot(self, audio_data: bytes, filename: str) -> str:
+        try:
+            file_md5 = hashlib.md5(audio_data).hexdigest()
+            b64_data = base64.b64encode(audio_data).decode("utf-8")
+
+            chunk_size = 61440
+            chunks = [b64_data[i : i + chunk_size] for i in range(0, len(b64_data), chunk_size)]
+            total_chunks = len(chunks)
+
+            logger.info(f"Uploading audio '{filename}' in {total_chunks} chunks (optimized)")
+
+            for i, chunk in enumerate(chunks, 1):
+                parameter = {
+                    "file_name": filename,
+                    "file_type": "wav",
+                    "file_size": len(audio_data),
+                    "current_block_index": i,
+                    "total_block_number": total_chunks,
+                    "block_content": chunk,
+                    "current_block_size": len(chunk),
+                    "file_md5": file_md5,
+                    "create_time": int(time.time() * 1000),
+                }
+
+                logger.debug(f"Sending chunk {i}/{total_chunks}")
+                response = self._webrtc_request(AUDIO_API["UPLOAD_AUDIO_FILE"], parameter)
+
+            logger.info(f"Audio upload completed for '{filename}'")
+
+            list_response = self._webrtc_request(AUDIO_API["GET_AUDIO_LIST"], {})
+
+            if list_response and "data" in list_response:
+                data_str = list_response.get("data", {}).get("data", "{}")
+                audio_list = json.loads(data_str).get("audio_list", [])
+
+                for audio in audio_list:
+                    if audio.get("CUSTOM_NAME") == filename:
+                        return audio.get("UNIQUE_ID")
+
+            logger.warning(
+                f"Could not find uploaded audio '{filename}' in list, using filename as UUID"
+            )
+            return filename
+
+        except Exception as e:
+            logger.error(f"Error uploading audio to robot: {e}")
+            raise
+
+    def _play_audio_on_robot(self, uuid: str):
+        try:
+            self._webrtc_request(AUDIO_API["SET_PLAY_MODE"], {"play_mode": PLAY_MODES["NO_CYCLE"]})
+            time.sleep(0.1)
+
+            parameter = {"unique_id": uuid}
+
+            logger.info(f"Playing audio with UUID: {uuid}")
+            self._webrtc_request(AUDIO_API["SELECT_START_PLAY"], parameter)
+
+        except Exception as e:
+            logger.error(f"Error playing audio on robot: {e}")
+            raise
+
+    def _stop_audio_playback(self):
+        try:
+            logger.debug("Stopping audio playback")
+            self._webrtc_request(AUDIO_API["PAUSE"], {})
+        except Exception as e:
+            logger.warning(f"Error stopping audio playback: {e}")
+
+    def _upload_and_play_megaphone(self, audio_data: bytes, duration: float):
+        try:
+            logger.debug("Entering megaphone mode")
+            self._webrtc_request(AUDIO_API["ENTER_MEGAPHONE"], {})
+
+            time.sleep(0.2)
+
+            b64_data = base64.b64encode(audio_data).decode("utf-8")
+
+            chunk_size = 4096
+            chunks = [b64_data[i : i + chunk_size] for i in range(0, len(b64_data), chunk_size)]
+            total_chunks = len(chunks)
+
+            logger.info(f"Uploading megaphone audio in {total_chunks} chunks")
+
+            for i, chunk in enumerate(chunks, 1):
+                parameter = {
+                    "current_block_size": len(chunk),
+                    "block_content": chunk,
+                    "current_block_index": i,
+                    "total_block_number": total_chunks,
+                }
+
+                logger.debug(f"Sending megaphone chunk {i}/{total_chunks}")
+                self._webrtc_request(AUDIO_API["UPLOAD_MEGAPHONE"], parameter)
+
+                if i < total_chunks:
+                    time.sleep(0.05)
+
+            logger.info("Megaphone audio upload completed, waiting for playback")
+
+            time.sleep(duration + 1.0)
+
+        except Exception as e:
+            logger.error(f"Error in megaphone mode: {e}")
+            try:
+                self._webrtc_request(AUDIO_API["EXIT_MEGAPHONE"], {})
+            except:
+                pass
+            raise
+        finally:
+            try:
+                logger.debug("Exiting megaphone mode")
+                self._webrtc_request(AUDIO_API["EXIT_MEGAPHONE"], {})
+                time.sleep(0.1)
+            except Exception as e:
+                logger.warning(f"Error exiting megaphone mode: {e}")
+
+    def __call__(self):
+        super().__call__()
+
+        if not self._robot:
+            logger.error("No robot instance provided to UnitreeSpeak skill")
+            return "Error: No robot instance available"
+
+        try:
+            display_text = self.text[:50] + "..." if len(self.text) > 50 else self.text
+            logger.info(f"Speaking: '{display_text}'")
+
+            logger.debug("Generating audio with OpenAI TTS")
+            audio_data = self._generate_audio(self.text)
+
+            with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as tmp_mp3:
+                tmp_mp3.write(audio_data)
+                tmp_mp3_path = tmp_mp3.name
+
+            try:
+                audio_array, sample_rate = sf.read(tmp_mp3_path)
+
+                if audio_array.ndim > 1:
+                    audio_array = np.mean(audio_array, axis=1)
+
+                target_sample_rate = 22050
+                if sample_rate != target_sample_rate:
+                    logger.debug(f"Resampling from {sample_rate}Hz to {target_sample_rate}Hz")
+                    old_length = len(audio_array)
+                    new_length = int(old_length * target_sample_rate / sample_rate)
+                    old_indices = np.arange(old_length)
+                    new_indices = np.linspace(0, old_length - 1, new_length)
+                    audio_array = np.interp(new_indices, old_indices, audio_array)
+                    sample_rate = target_sample_rate
+
+                audio_array = audio_array / np.max(np.abs(audio_array))
+
+                with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_wav:
+                    sf.write(tmp_wav.name, audio_array, sample_rate, format="WAV", subtype="PCM_16")
+                    tmp_wav.seek(0)
+                    wav_data = open(tmp_wav.name, "rb").read()
+                    os.unlink(tmp_wav.name)
+
+                logger.info(
+                    f"Audio size: {len(wav_data) / 1024:.1f}KB, duration: {len(audio_array) / sample_rate:.1f}s"
+                )
+
+            finally:
+                os.unlink(tmp_mp3_path)
+
+            if self.use_megaphone:
+                logger.debug("Using megaphone mode for lower latency")
+                duration = len(audio_array) / sample_rate
+                self._upload_and_play_megaphone(wav_data, duration)
+
+                return f"Spoke: '{display_text}' on robot successfully (megaphone mode)"
+            else:
+                filename = f"speak_{int(time.time() * 1000)}"
+
+                logger.debug("Uploading audio to robot")
+                uuid = self._upload_audio_to_robot(wav_data, filename)
+
+                logger.debug("Playing audio on robot")
+                self._play_audio_on_robot(uuid)
+
+                duration = len(audio_array) / sample_rate
+                logger.debug(f"Waiting {duration:.1f}s for playback to complete")
+                # time.sleep(duration + 0.2)
+
+                # self._stop_audio_playback()
+
+                return f"Spoke: '{display_text}' on robot successfully"
+
+        except Exception as e:
+            logger.error(f"Error in speak skill: {e}")
+            return f"Error speaking text: {str(e)}"
diff --git a/dimos/web/dimos_interface/api/server.py b/dimos/web/dimos_interface/api/server.py
index 812973e300..bcc590ab46 100644
--- a/dimos/web/dimos_interface/api/server.py
+++ b/dimos/web/dimos_interface/api/server.py
@@ -27,7 +27,7 @@
 # Fast Api & Uvicorn
 import cv2
 from dimos.web.edge_io import EdgeIO
-from fastapi import FastAPI, Request, Form, HTTPException
+from fastapi import FastAPI, Request, Form, HTTPException, UploadFile, File
 from fastapi.responses import HTMLResponse, StreamingResponse, JSONResponse
 from sse_starlette.sse import EventSourceResponse
 from fastapi.templating import Jinja2Templates
@@ -42,6 +42,14 @@
 import reactivex as rx
 from fastapi.middleware.cors import CORSMiddleware
 
+# For audio processing
+import io
+import time
+import numpy as np
+import ffmpeg
+import soundfile as sf
+from dimos.stream.audio.base import AudioEvent
+
 # TODO: Resolve threading, start/stop stream functionality.
 
 
@@ -53,6 +61,7 @@ def __init__(
         host="0.0.0.0",
         port=5555,
         text_streams=None,
+        audio_subject=None,
         **streams,
     ):
         print("Starting FastAPIServer initialization...")  # Debug print
@@ -88,6 +97,7 @@ def __init__(
         # Create a Subject for text queries
         self.query_subject = rx.subject.Subject()
         self.query_stream = self.query_subject.pipe(ops.share())
+        self.audio_subject = audio_subject
 
         for key in self.streams:
             if self.streams[key] is not None:
@@ -198,6 +208,33 @@ async def text_stream_generator(self, key):
         finally:
             self.text_clients.remove(client_id)
 
+    @staticmethod
+    def _decode_audio(raw: bytes) -> tuple[np.ndarray, int]:
+        """Convert the webm/opus blob sent by the browser into mono 16-kHz PCM."""
+        try:
+            # Use ffmpeg to convert to 16-kHz mono 16-bit PCM WAV in memory
+            out, _ = (
+                ffmpeg.input("pipe:0")
+                .output(
+                    "pipe:1",
+                    format="wav",
+                    acodec="pcm_s16le",
+                    ac=1,
+                    ar="16000",
+                    loglevel="quiet",
+                )
+                .run(input=raw, capture_stdout=True, capture_stderr=True)
+            )
+            # Load with soundfile (returns float32 by default)
+            audio, sr = sf.read(io.BytesIO(out), dtype="float32")
+            # Ensure 1-D array (mono)
+            if audio.ndim > 1:
+                audio = audio[:, 0]
+            return np.array(audio), sr
+        except Exception as exc:
+            print(f"ffmpeg decoding failed: {exc}")
+            return None, None
+
     def setup_routes(self):
         """Set up FastAPI routes."""
 
@@ -221,6 +258,7 @@ async def index(request: Request):
                     "request": request,
                     "stream_keys": stream_keys,
                     "text_stream_keys": text_stream_keys,
+                    "has_voice": self.audio_subject is not None,
                 },
             )
 
@@ -240,6 +278,39 @@ async def submit_query(query: str = Form(...)):
                     content={"success": False, "message": f"Server error: {str(e)}"},
                 )
 
+        @self.app.post("/upload_audio")
+        async def upload_audio(file: UploadFile = File(...)):
+            """Handle audio upload from the browser."""
+            if self.audio_subject is None:
+                return JSONResponse(
+                    status_code=400,
+                    content={"success": False, "message": "Voice input not configured"},
+                )
+
+            try:
+                data = await file.read()
+                audio_np, sr = self._decode_audio(data)
+                if audio_np is None:
+                    return JSONResponse(
+                        status_code=400,
+                        content={"success": False, "message": "Unable to decode audio"},
+                    )
+
+                event = AudioEvent(
+                    data=audio_np,
+                    sample_rate=sr,
+                    timestamp=time.time(),
+                    channels=1 if audio_np.ndim == 1 else audio_np.shape[1],
+                )
+
+                # Push to reactive stream
+                self.audio_subject.on_next(event)
+                print(f"Received audio – {event.data.shape[0] / sr:.2f} s, {sr} Hz")
+                return {"success": True}
+            except Exception as e:
+                print(f"Failed to process uploaded audio: {e}")
+                return JSONResponse(status_code=500, content={"success": False, "message": str(e)})
+
         # Unitree API endpoints
         @self.app.get("/unitree/status")
         async def unitree_status():
diff --git a/dimos/web/dimos_interface/api/templates/index_fastapi.html b/dimos/web/dimos_interface/api/templates/index_fastapi.html
index 94cf0be328..406557c04a 100644
--- a/dimos/web/dimos_interface/api/templates/index_fastapi.html
+++ b/dimos/web/dimos_interface/api/templates/index_fastapi.html
@@ -130,6 +130,7 @@
         .query-form {
             display: flex;
             gap: 10px;
+            align-items: center;
         }
         
         .query-input {
@@ -155,6 +156,81 @@
             background-color: #218838;
         }
         
+        /* Voice button styles */
+        .voice-button {
+            width: 50px;
+            height: 50px;
+            border-radius: 50%;
+            background-color: #dc3545;
+            color: white;
+            border: none;
+            cursor: pointer;
+            display: flex;
+            align-items: center;
+            justify-content: center;
+            font-size: 24px;
+            transition: all 0.3s ease;
+            box-shadow: 0 2px 5px rgba(0,0,0,0.2);
+            position: relative;
+        }
+        
+        .voice-button:hover {
+            transform: scale(1.1);
+            box-shadow: 0 4px 8px rgba(0,0,0,0.3);
+        }
+        
+        .voice-button.recording {
+            background-color: #ff0000;
+            animation: pulse 1.5s infinite;
+        }
+        
+        .voice-button.recording::after {
+            content: '';
+            position: absolute;
+            top: -10px;
+            left: -10px;
+            right: -10px;
+            bottom: -10px;
+            border: 3px solid rgba(255, 0, 0, 0.5);
+            border-radius: 50%;
+            animation: ripple 1.5s infinite;
+        }
+        
+        @keyframes pulse {
+            0% { transform: scale(1); }
+            50% { transform: scale(1.05); }
+            100% { transform: scale(1); }
+        }
+        
+        @keyframes ripple {
+            0% {
+                transform: scale(1);
+                opacity: 1;
+            }
+            100% {
+                transform: scale(1.2);
+                opacity: 0;
+            }
+        }
+        
+        .voice-status {
+            position: absolute;
+            top: -25px;
+            left: 50%;
+            transform: translateX(-50%);
+            background-color: rgba(0,0,0,0.8);
+            color: white;
+            padding: 4px 8px;
+            border-radius: 4px;
+            font-size: 12px;
+            white-space: nowrap;
+            display: none;
+        }
+        
+        .voice-button.recording .voice-status {
+            display: block;
+        }
+        
         .query-response {
             margin-top: 15px;
             padding: 10px;
@@ -218,6 +294,12 @@ <h2>Ask a Question</h2>
         <form id="queryForm" class="query-form">
             <input type="text" id="queryInput" class="query-input" placeholder="Type your query here..." required>
             <button type="submit" class="query-button">Send</button>
+            {% if has_voice %}
+            <button type="button" id="voiceButton" class="voice-button">
+                <span class="voice-status">Recording...</span>
+                🎤
+            </button>
+            {% endif %}
         </form>
         <div id="queryResponse" class="query-response"></div>
     </div>
@@ -277,6 +359,72 @@ <h2>{{ key.replace('_', ' ').title() }}</h2>
             window.location.reload(true);
         }
 
+        // Voice recording functionality
+        {% if has_voice %}
+        let mediaRecorder;
+        let chunks = [];
+        const voiceBtn = document.getElementById('voiceButton');
+        const queryResponse = document.getElementById('queryResponse');
+
+        voiceBtn.addEventListener('click', async () => {
+            if (mediaRecorder && mediaRecorder.state === 'recording') {
+                // Stop recording
+                mediaRecorder.stop();
+                voiceBtn.classList.remove('recording');
+            } else {
+                // Start recording
+                try {
+                    if (!mediaRecorder) {
+                        const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
+                        mediaRecorder = new MediaRecorder(stream);
+                        
+                        mediaRecorder.ondataavailable = e => chunks.push(e.data);
+                        
+                        mediaRecorder.onstop = async () => {
+                            const blob = new Blob(chunks, { type: 'audio/webm' });
+                            chunks = [];
+                            
+                            // Show uploading status
+                            queryResponse.textContent = 'Processing voice command...';
+                            queryResponse.className = 'query-response';
+                            queryResponse.style.display = 'block';
+                            
+                            const formData = new FormData();
+                            formData.append('file', blob, 'recording.webm');
+                            
+                            try {
+                                const res = await fetch('/upload_audio', { 
+                                    method: 'POST', 
+                                    body: formData 
+                                });
+                                const json = await res.json();
+                                
+                                if (json.success) {
+                                    queryResponse.textContent = 'Voice command received!';
+                                    queryResponse.className = 'query-response success';
+                                    setTimeout(() => {
+                                        queryResponse.style.display = 'none';
+                                    }, 3000);
+                                } else {
+                                    queryResponse.textContent = 'Error: ' + json.message;
+                                    queryResponse.className = 'query-response error';
+                                }
+                            } catch (err) {
+                                queryResponse.textContent = 'Upload failed: ' + err.message;
+                                queryResponse.className = 'query-response error';
+                            }
+                        };
+                    }
+                    
+                    mediaRecorder.start();
+                    voiceBtn.classList.add('recording');
+                } catch (err) {
+                    alert('Microphone access denied. Please allow microphone access to use voice commands.');
+                }
+            }
+        });
+        {% endif %}
+
         // Handle query form submission
         document.getElementById('queryForm').addEventListener('submit', async function(e) {
             e.preventDefault();
diff --git a/dimos/web/dimos_interface/src/App.svelte b/dimos/web/dimos_interface/src/App.svelte
index a1ddae4931..c249f3e3ea 100644
--- a/dimos/web/dimos_interface/src/App.svelte
+++ b/dimos/web/dimos_interface/src/App.svelte
@@ -3,7 +3,27 @@
   import Input from './components/Input.svelte';
   import History from './components/History.svelte';
   import StreamViewer from './components/StreamViewer.svelte';
+  import VoiceButton from './components/VoiceButton.svelte';
   import { theme } from './stores/theme';
+  import { history } from './stores/history';
+
+  const handleVoiceCommand = async (event: CustomEvent) => {
+    if (event.detail.success) {
+      // Show voice processing message
+      history.update(h => [...h, { 
+        command: '[voice command]', 
+        outputs: ['Processing voice command...'] 
+      }]);
+      
+      // The actual command will be processed by the agent through the audio pipeline
+      // and will appear in the text stream
+    } else {
+      history.update(h => [...h, { 
+        command: '[voice command]', 
+        outputs: [`Error: ${event.detail.error}`] 
+      }]);
+    }
+  };
 </script>
 
 <svelte:head>
@@ -29,3 +49,5 @@
     <Input />
   </div>
 </main>
+
+<VoiceButton on:voiceCommand={handleVoiceCommand} />
diff --git a/dimos/web/dimos_interface/src/components/Input.svelte b/dimos/web/dimos_interface/src/components/Input.svelte
index 86eba9184d..3a2b515f3d 100644
--- a/dimos/web/dimos_interface/src/components/Input.svelte
+++ b/dimos/web/dimos_interface/src/components/Input.svelte
@@ -29,6 +29,32 @@
 
   const handleKeyDown = async (event: KeyboardEvent) => {
     if (event.key === 'Enter') {
+      await executeCommand();
+    } else if (event.key === 'ArrowUp') {
+      if (historyIndex < $history.length - 1) {
+        historyIndex++;
+        command = $history[$history.length - 1 - historyIndex].command;
+      }
+      event.preventDefault();
+    } else if (event.key === 'ArrowDown') {
+      if (historyIndex > -1) {
+        historyIndex--;
+        command = historyIndex >= 0 ? $history[$history.length - 1 - historyIndex].command : '';
+      }
+      event.preventDefault();
+    } else if (event.key === 'Tab') {
+      event.preventDefault();
+      const autoCompleteCommand = Object.keys(commands).find((cmd) => cmd.startsWith(command));
+      if (autoCompleteCommand) {
+        command = autoCompleteCommand;
+      }
+    } else if (event.ctrlKey && event.key === 'l') {
+      event.preventDefault();
+      $history = [];
+    }
+  };
+
+  const executeCommand = async () => {
       const [commandName, ...args] = command.split(' ');
 
       if (import.meta.env.VITE_TRACKING_ENABLED === 'true') {
@@ -57,28 +83,6 @@
 
       command = '';
       historyIndex = -1;
-    } else if (event.key === 'ArrowUp') {
-      if (historyIndex < $history.length - 1) {
-        historyIndex++;
-        command = $history[$history.length - 1 - historyIndex].command;
-      }
-      event.preventDefault();
-    } else if (event.key === 'ArrowDown') {
-      if (historyIndex > -1) {
-        historyIndex--;
-        command = historyIndex >= 0 ? $history[$history.length - 1 - historyIndex].command : '';
-      }
-      event.preventDefault();
-    } else if (event.key === 'Tab') {
-      event.preventDefault();
-      const autoCompleteCommand = Object.keys(commands).find((cmd) => cmd.startsWith(command));
-      if (autoCompleteCommand) {
-        command = autoCompleteCommand;
-      }
-    } else if (event.ctrlKey && event.key === 'l') {
-      event.preventDefault();
-      $history = [];
-    }
   };
 </script>
 
diff --git a/dimos/web/dimos_interface/src/components/VoiceButton.svelte b/dimos/web/dimos_interface/src/components/VoiceButton.svelte
new file mode 100644
index 0000000000..0f9682519a
--- /dev/null
+++ b/dimos/web/dimos_interface/src/components/VoiceButton.svelte
@@ -0,0 +1,262 @@
+<!--
+ Copyright 2025 Dimensional Inc.
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
+<script lang="ts">
+  import { createEventDispatcher } from 'svelte';
+  import { theme } from '../stores/theme';
+  import { connectTextStream } from '../stores/stream';
+
+  const dispatch = createEventDispatcher();
+
+  // Get the server URL dynamically based on current location
+  const getServerUrl = () => {
+    // In production, use the same host as the frontend but on port 5555
+    const hostname = window.location.hostname;
+    return `http://${hostname}:5555`;
+  };
+
+  let isRecording = false;
+  let mediaRecorder: MediaRecorder | null = null;
+  let chunks: Blob[] = [];
+  let isProcessing = false;
+
+  async function toggleRecording() {
+    if (isRecording && mediaRecorder) {
+      // Stop recording
+      mediaRecorder.stop();
+      isRecording = false;
+    } else {
+      // Start recording
+      try {
+        if (!mediaRecorder) {
+          const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
+          mediaRecorder = new MediaRecorder(stream);
+          
+          mediaRecorder.ondataavailable = (e) => chunks.push(e.data);
+          
+          mediaRecorder.onstop = async () => {
+            isProcessing = true;
+            const blob = new Blob(chunks, { type: 'audio/webm' });
+            chunks = [];
+            
+            // Upload to backend
+            const formData = new FormData();
+            formData.append('file', blob, 'recording.webm');
+            
+            try {
+              const res = await fetch(`${getServerUrl()}/upload_audio`, {
+                method: 'POST',
+                body: formData
+              });
+              
+              const json = await res.json();
+              
+              if (json.success) {
+                // Connect to agent_responses stream to see the output
+                connectTextStream('agent_responses');
+                dispatch('voiceCommand', { success: true });
+              } else {
+                dispatch('voiceCommand', { 
+                  success: false, 
+                  error: json.message 
+                });
+              }
+            } catch (err) {
+              dispatch('voiceCommand', { 
+                success: false, 
+                error: err instanceof Error ? err.message : 'Upload failed' 
+              });
+            } finally {
+              isProcessing = false;
+            }
+          };
+        }
+        
+        mediaRecorder.start();
+        isRecording = true;
+      } catch (err) {
+        dispatch('voiceCommand', { 
+          success: false, 
+          error: 'Microphone access denied' 
+        });
+      }
+    }
+  }
+
+  // Keyboard shortcut support
+  function handleKeyPress(event: KeyboardEvent) {
+    // Ctrl+M or Cmd+M to toggle recording
+    if ((event.ctrlKey || event.metaKey) && event.key === 'm') {
+      event.preventDefault();
+      toggleRecording();
+    }
+  }
+</script>
+
+<svelte:window on:keydown={handleKeyPress} />
+
+<button
+  class="voice-button-fab"
+  class:recording={isRecording}
+  class:processing={isProcessing}
+  on:click={toggleRecording}
+  disabled={isProcessing}
+  style="--theme-color: {$theme.primary}"
+  title={isRecording ? 'Stop recording (Ctrl+M)' : 'Start voice command (Ctrl+M)'}
+>
+  {#if isProcessing}
+    <span class="processing-icon">⟳</span>
+  {:else if isRecording}
+    <span class="mic-icon recording">◉</span>
+  {:else}
+    <span class="mic-icon">🎤</span>
+  {/if}
+</button>
+
+<style>
+  .voice-button-fab {
+    position: fixed;
+    bottom: 30px;
+    right: 30px;
+    width: 120px; /* Increased from 60px to 2x */
+    height: 120px; /* Increased from 60px to 2x */
+    border-radius: 50%;
+    background: #000;
+    border: 2px solid var(--theme-color);
+    color: var(--theme-color);
+    cursor: pointer;
+    font-size: 48px; /* Increased from 24px to 2x */
+    display: flex;
+    align-items: center;
+    justify-content: center;
+    transition: all 0.3s ease;
+    box-shadow: 0 4px 12px rgba(0, 0, 0, 0.5);
+    z-index: 1000;
+  }
+
+  .voice-button-fab:hover:not(:disabled) {
+    background: var(--theme-color);
+    color: #000;
+    box-shadow: 0 6px 20px var(--theme-color);
+    transform: scale(1.1);
+  }
+
+  .voice-button-fab:active:not(:disabled) {
+    transform: scale(0.95);
+  }
+
+  .voice-button-fab:disabled {
+    opacity: 0.5;
+    cursor: not-allowed;
+  }
+
+  .voice-button-fab.recording {
+    animation: pulse 1.5s infinite;
+    border-color: #ff0000;
+    color: #ff0000;
+    background: rgba(255, 0, 0, 0.1);
+  }
+
+  .voice-button-fab.recording:hover {
+    background: #ff0000;
+    color: #000;
+  }
+
+  .voice-button-fab.processing {
+    border-style: dashed;
+  }
+
+  .mic-icon {
+    display: inline-block;
+    transition: transform 0.2s ease;
+    font-size: 56px; /* Increased from 28px to 2x */
+  }
+
+  .mic-icon.recording {
+    color: #ff0000;
+    animation: blink 1s infinite;
+  }
+
+  .processing-icon {
+    display: inline-block;
+    animation: spin 1s linear infinite;
+    font-size: 56px; /* Increased from 28px to 2x */
+  }
+
+  @keyframes pulse {
+    0% { 
+      transform: scale(1);
+      box-shadow: 0 4px 12px rgba(255, 0, 0, 0.4);
+    }
+    50% { 
+      transform: scale(1.05);
+      box-shadow: 0 4px 20px rgba(255, 0, 0, 0.6);
+    }
+    100% { 
+      transform: scale(1);
+      box-shadow: 0 4px 12px rgba(255, 0, 0, 0.4);
+    }
+  }
+
+  @keyframes blink {
+    0%, 100% { opacity: 1; }
+    50% { opacity: 0.5; }
+  }
+
+  @keyframes spin {
+    from { transform: rotate(0deg); }
+    to { transform: rotate(360deg); }
+  }
+
+  /* Terminal-style glow effect */
+  .voice-button-fab.recording::after {
+    content: '';
+    position: absolute;
+    top: -8px;
+    left: -8px;
+    right: -8px;
+    bottom: -8px;
+    border: 2px solid rgba(255, 0, 0, 0.5);
+    border-radius: 50%;
+    animation: ripple 1.5s infinite;
+    pointer-events: none;
+  }
+
+  @keyframes ripple {
+    0% {
+      transform: scale(1);
+      opacity: 0.8;
+    }
+    100% {
+      transform: scale(1.3);
+      opacity: 0;
+    }
+  }
+
+  /* Mobile responsive - slightly smaller on mobile */
+  @media (max-width: 640px) {
+    .voice-button-fab {
+      width: 100px; /* Increased from 50px to 2x */
+      height: 100px; /* Increased from 50px to 2x */
+      bottom: 20px;
+      right: 20px;
+    }
+
+    .mic-icon, .processing-icon {
+      font-size: 44px; /* Increased from 22px to 2x */
+    }
+  }
+</style> 
\ No newline at end of file
diff --git a/dimos/web/dimos_interface/src/stores/stream.ts b/dimos/web/dimos_interface/src/stores/stream.ts
index 4ea07a71d7..eee46f84bf 100644
--- a/dimos/web/dimos_interface/src/stores/stream.ts
+++ b/dimos/web/dimos_interface/src/stores/stream.ts
@@ -18,6 +18,13 @@ import { writable, derived, get } from 'svelte/store';
 import { simulationManager, simulationStore } from '../utils/simulation';
 import { history } from './history';
 
+// Get the server URL dynamically based on current location
+const getServerUrl = () => {
+  // In production, use the same host as the frontend but on port 5555
+  const hostname = window.location.hostname;
+  return `http://${hostname}:5555`;
+};
+
 interface StreamState {
   isVisible: boolean;
   url: string | null;
@@ -65,7 +72,7 @@ export const combinedStreamState = derived(
 // Function to fetch available streams
 async function fetchAvailableStreams(): Promise<string[]> {
   try {
-    const response = await fetch('http://0.0.0.0:5555/streams', {
+    const response = await fetch(`${getServerUrl()}/streams`, {
       headers: {
         'Accept': 'application/json'
       }
@@ -100,7 +107,7 @@ export const showStream = async (streamKey?: string) => {
 
     streamStore.set({
       isVisible: true,
-      url: 'http://0.0.0.0:5555',
+      url: getServerUrl(),
       streamKeys: selectedStreams,
       isLoading: false,
       error: null,
@@ -134,7 +141,7 @@ export const connectTextStream = (key: string): void => {
   }
 
   // Create new EventSource
-  const eventSource = new EventSource(`http://0.0.0.0:5555/text_stream/${key}`);
+  const eventSource = new EventSource(`${getServerUrl()}/text_stream/${key}`);
   textEventSources[key] = eventSource;
   // Handle incoming messages
   eventSource.addEventListener('message', (event) => {
diff --git a/dimos/web/dimos_interface/vite.config.ts b/dimos/web/dimos_interface/vite.config.ts
index 296050256a..29be79dd4a 100644
--- a/dimos/web/dimos_interface/vite.config.ts
+++ b/dimos/web/dimos_interface/vite.config.ts
@@ -22,6 +22,7 @@ export default defineConfig({
   plugins: [svelte()],
   server: {
     port: 3000,
+    host: '0.0.0.0',
     watch: {
       // Exclude node_modules, .git and other large directories
       ignored: ['**/node_modules/**', '**/.git/**', '**/dist/**', 'lambda/**'],
diff --git a/dimos/web/robot_web_interface.py b/dimos/web/robot_web_interface.py
index 72dcce9d29..33847c0056 100644
--- a/dimos/web/robot_web_interface.py
+++ b/dimos/web/robot_web_interface.py
@@ -23,12 +23,13 @@
 class RobotWebInterface(FastAPIServer):
     """Wrapper class for the dimos-interface FastAPI server."""
 
-    def __init__(self, port=5555, text_streams=None, **streams):
+    def __init__(self, port=5555, text_streams=None, audio_subject=None, **streams):
         super().__init__(
             dev_name="Robot Web Interface",
             edge_type="Bidirectional",
             host="0.0.0.0",
             port=port,
             text_streams=text_streams,
+            audio_subject=audio_subject,
             **streams,
         )
diff --git a/tests/run.py b/tests/run.py
index 8ddb2261e0..9ae6f81398 100644
--- a/tests/run.py
+++ b/tests/run.py
@@ -36,7 +36,7 @@
 import threading
 import json
 from dimos.types.vector import Vector
-from dimos.skills.speak import Speak
+from dimos.skills.unitree.unitree_speak import UnitreeSpeak
 
 from dimos.perception.object_detection_stream import ObjectDetectionStream
 from dimos.perception.detection2d.detic_2d_det import Detic2DDetector
@@ -211,37 +211,35 @@ def newmap(msg):
 agent_response_subject = rx.subject.Subject()
 agent_response_stream = agent_response_subject.pipe(ops.share())
 local_planner_viz_stream = robot.local_planner_viz_stream.pipe(ops.share())
+audio_subject = rx.subject.Subject()
 
 # Initialize object detection stream
 min_confidence = 0.6
 class_filter = None  # No class filtering
-min_confidence = 0.99  # temporarily disable detections
-# detector = Detic2DDetector(vocabulary=None, threshold=min_confidence)
 
 # Create video stream from robot's camera
-video_stream = robot.get_video_stream()  # WebRTC doesn't use ROS video stream
+video_stream = backpressure(robot.get_video_stream())  # WebRTC doesn't use ROS video stream
 
 # # Initialize ObjectDetectionStream with robot
-# object_detector = ObjectDetectionStream(
-#     camera_intrinsics=robot.camera_intrinsics,
-#     min_confidence=min_confidence,
-#     class_filter=class_filter,
-#     get_pose=robot.get_pose,
-#     detector=detector,
-#     video_stream=video_stream,
-# )
+object_detector = ObjectDetectionStream(
+    camera_intrinsics=robot.camera_intrinsics,
+    class_filter=class_filter,
+    get_pose=robot.get_pose,
+    video_stream=video_stream,
+    draw_masks=True,
+)
 
 # # Create visualization stream for web interface
-# viz_stream = backpressure(object_detector.get_stream()).pipe(
-#     ops.share(),
-#     ops.map(lambda x: x["viz_frame"] if x is not None else None),
-#     ops.filter(lambda x: x is not None),
-# )
+viz_stream = backpressure(object_detector.get_stream()).pipe(
+    ops.share(),
+    ops.map(lambda x: x["viz_frame"] if x is not None else None),
+    ops.filter(lambda x: x is not None),
+)
 
 # # Get the formatted detection stream
-# formatted_detection_stream = object_detector.get_formatted_stream().pipe(
-#     ops.filter(lambda x: x is not None)
-# )
+formatted_detection_stream = object_detector.get_formatted_stream().pipe(
+    ops.filter(lambda x: x is not None)
+)
 
 
 # Create a direct mapping that combines detection data with locations
@@ -272,20 +270,23 @@ def combine_with_locations(object_detections):
 
 
 # Create the combined stream with a simple pipe operation
-# enhanced_data_stream = formatted_detection_stream.pipe(ops.map(combine_with_locations), ops.share())
+enhanced_data_stream = formatted_detection_stream.pipe(ops.map(combine_with_locations), ops.share())
 
 streams = {
     "unitree_video": robot.get_video_stream(),  # Changed from get_ros_video_stream to get_video_stream for WebRTC
     "local_planner_viz": local_planner_viz_stream,
-    # "object_detection": viz_stream,  # Uncommented object detection
+    "object_detection": viz_stream,  # Uncommented object detection
 }
 text_streams = {
     "agent_responses": agent_response_stream,
 }
 
-web_interface = RobotWebInterface(port=5555, text_streams=text_streams, **streams)
+web_interface = RobotWebInterface(
+    port=5555, text_streams=text_streams, audio_subject=audio_subject, **streams
+)
 
-# stt_node = stt()
+stt_node = stt()
+stt_node.consume_audio(audio_subject.pipe(ops.share()))
 
 # Read system query from prompt.txt file
 with open(
@@ -296,12 +297,15 @@ def combine_with_locations(object_detections):
 # Create a ClaudeAgent instance
 agent = ClaudeAgent(
     dev_name="test_agent",
-    # input_query_stream=stt_node.emit_text(),
-    input_query_stream=web_interface.query_stream,
+    input_query_stream=stt_node.emit_text(),
+    # input_query_stream=web_interface.query_stream,
+    input_data_stream=enhanced_data_stream,
     skills=robot.get_skills(),
     system_query=system_query,
-    model_name="claude-3-7-sonnet-latest",
+    model_name="claude-3-5-haiku-latest",
     thinking_budget_tokens=0,
+    max_output_tokens_per_request=8192,
+    # model_name="llama-4-scout-17b-16e-instruct",
 )
 
 # tts_node = tts()
@@ -312,9 +316,9 @@ def combine_with_locations(object_detections):
 robot_skills.add(Observe)
 robot_skills.add(KillSkill)
 robot_skills.add(NavigateWithText)
-robot_skills.add(FollowHuman)
+# robot_skills.add(FollowHuman) # TODO: broken
 robot_skills.add(GetPose)
-# robot_skills.add(Speak)
+robot_skills.add(UnitreeSpeak)  # Re-enable Speak skill
 robot_skills.add(NavigateToGoal)
 robot_skills.add(Explore)
 
@@ -322,11 +326,11 @@ def combine_with_locations(object_detections):
 robot_skills.create_instance("Observe", robot=robot, agent=agent)
 robot_skills.create_instance("KillSkill", robot=robot, skill_library=robot_skills)
 robot_skills.create_instance("NavigateWithText", robot=robot)
-robot_skills.create_instance("FollowHuman", robot=robot)
+# robot_skills.create_instance("FollowHuman", robot=robot)
 robot_skills.create_instance("GetPose", robot=robot)
 robot_skills.create_instance("NavigateToGoal", robot=robot)
 robot_skills.create_instance("Explore", robot=robot)
-# robot_skills.create_instance("Speak", tts_node=tts_node)
+robot_skills.create_instance("UnitreeSpeak", robot=robot)  # Now only needs robot instance
 
 # Subscribe to agent responses and send them to the subject
 agent.get_response_observable().subscribe(lambda x: agent_response_subject.on_next(x))