From 582fa32b2eb30eebc28e4447a93fcdeb8cc5ba92 Mon Sep 17 00:00:00 2001
From: stash <pomichterstash@gmail.com>
Date: Thu, 10 Jul 2025 02:57:53 -0700
Subject: [PATCH 01/18] Cerebras added back and working

---
 tests/run.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/tests/run.py b/tests/run.py
index 8ddb2261e0..6f7fbf9d0c 100644
--- a/tests/run.py
+++ b/tests/run.py
@@ -294,14 +294,15 @@ def combine_with_locations(object_detections):
     system_query = f.read()
 
 # Create a ClaudeAgent instance
-agent = ClaudeAgent(
+agent = CerebrasAgent(
     dev_name="test_agent",
     # input_query_stream=stt_node.emit_text(),
     input_query_stream=web_interface.query_stream,
     skills=robot.get_skills(),
     system_query=system_query,
-    model_name="claude-3-7-sonnet-latest",
-    thinking_budget_tokens=0,
+    # model_name="claude-3-7-sonnet-latest",
+    # thinking_budget_tokens=0,
+    model_name="llama-4-scout-17b-16e-instruct",
 )
 
 # tts_node = tts()

From 69f5111ed4b15149b53dcf20b9b7dac000a93866 Mon Sep 17 00:00:00 2001
From: stash <pomichterstash@gmail.com>
Date: Thu, 10 Jul 2025 03:11:31 -0700
Subject: [PATCH 02/18] Removed non-active unitree skills

---
 dimos/robot/unitree/unitree_skills.py | 176 +++++++++++++-------------
 1 file changed, 88 insertions(+), 88 deletions(-)

diff --git a/dimos/robot/unitree/unitree_skills.py b/dimos/robot/unitree/unitree_skills.py
index d191753294..5029123ed1 100644
--- a/dimos/robot/unitree/unitree_skills.py
+++ b/dimos/robot/unitree/unitree_skills.py
@@ -51,34 +51,34 @@
         1006,
         "Recovers the robot to a state from which it can take more commands. Useful to run after multiple dynamic commands like front flips.",
     ),
-    (
-        "Euler",
-        1007,
-        "Adjusts the robot's orientation using Euler angles, providing precise control over its rotation.",
-    ),
+    # (
+    #     "Euler",
+    #     1007,
+    #     "Adjusts the robot's orientation using Euler angles, providing precise control over its rotation.",
+    # ),
     # ("Move", 1008, "Move the robot using velocity commands."),  # Intentionally omitted
     ("Sit", 1009, "Commands the robot to sit down from a standing or moving stance."),
-    (
-        "RiseSit",
-        1010,
-        "Commands the robot to rise back to a standing position from a sitting posture.",
-    ),
-    (
-        "SwitchGait",
-        1011,
-        "Switches the robot's walking pattern or style dynamically, suitable for different terrains or speeds.",
-    ),
-    ("Trigger", 1012, "Triggers a specific action or custom routine programmed into the robot."),
-    (
-        "BodyHeight",
-        1013,
-        "Adjusts the height of the robot's body from the ground, useful for navigating various obstacles.",
-    ),
-    (
-        "FootRaiseHeight",
-        1014,
-        "Controls how high the robot lifts its feet during movement, which can be adjusted for different surfaces.",
-    ),
+    # (
+    #     "RiseSit",
+    #     1010,
+    #     "Commands the robot to rise back to a standing position from a sitting posture.",
+    # ),
+    # (
+    #     "SwitchGait",
+    #     1011,
+    #     "Switches the robot's walking pattern or style dynamically, suitable for different terrains or speeds.",
+    # ),
+    # ("Trigger", 1012, "Triggers a specific action or custom routine programmed into the robot."),
+    # (
+    #     "BodyHeight",
+    #     1013,
+    #     "Adjusts the height of the robot's body from the ground, useful for navigating various obstacles.",
+    # ),
+    # (
+    #     "FootRaiseHeight",
+    #     1014,
+    #     "Controls how high the robot lifts its feet during movement, which can be adjusted for different surfaces.",
+    # ),
     (
         "SpeedLevel",
         1015,
@@ -90,16 +90,16 @@
         "Performs a greeting action, which could involve a wave or other friendly gesture.",
     ),
     ("Stretch", 1017, "Engages the robot in a stretching routine."),
-    (
-        "TrajectoryFollow",
-        1018,
-        "Directs the robot to follow a predefined trajectory, which could involve complex paths or maneuvers.",
-    ),
-    (
-        "ContinuousGait",
-        1019,
-        "Enables a mode for continuous walking or running, ideal for long-distance travel.",
-    ),
+    # (
+    #     "TrajectoryFollow",
+    #     1018,
+    #     "Directs the robot to follow a predefined trajectory, which could involve complex paths or maneuvers.",
+    # ),
+    # (
+    #     "ContinuousGait",
+    #     1019,
+    #     "Enables a mode for continuous walking or running, ideal for long-distance travel.",
+    # ),
     ("Content", 1020, "To display or trigger when the robot is happy."),
     ("Wallow", 1021, "The robot falls onto its back and rolls around."),
     (
@@ -108,18 +108,18 @@
         "Performs a predefined dance routine 1, programmed for entertainment or demonstration.",
     ),
     ("Dance2", 1023, "Performs another variant of a predefined dance routine 2."),
-    ("GetBodyHeight", 1024, "Retrieves the current height of the robot's body from the ground."),
-    (
-        "GetFootRaiseHeight",
-        1025,
-        "Retrieves the current height at which the robot's feet are being raised during movement.",
-    ),
-    ("GetSpeedLevel", 1026, "Returns the current speed level at which the robot is operating."),
-    (
-        "SwitchJoystick",
-        1027,
-        "Toggles the control mode to joystick input, allowing for manual direction of the robot's movements.",
-    ),
+    # ("GetBodyHeight", 1024, "Retrieves the current height of the robot's body from the ground."),
+    # (
+    #     "GetFootRaiseHeight",
+    #     1025,
+    #     "Retrieves the current height at which the robot's feet are being raised during movement.",
+    # ),
+    # ("GetSpeedLevel", 1026, "Returns the current speed level at which the robot is operating."),
+    # (
+    #     "SwitchJoystick",
+    #     1027,
+    #     "Toggles the control mode to joystick input, allowing for manual direction of the robot's movements.",
+    # ),
     (
         "Pose",
         1028,
@@ -137,46 +137,46 @@
         1032,
         "Initiates a pouncing movement forward, mimicking animal-like pouncing behavior.",
     ),
-    ("WiggleHips", 1033, "Causes the robot to wiggle its hips."),
-    (
-        "GetState",
-        1034,
-        "Retrieves the current operational state of the robot, including status reports or diagnostic information.",
-    ),
-    (
-        "EconomicGait",
-        1035,
-        "Engages a more energy-efficient walking or running mode to conserve battery life.",
-    ),
-    ("FingerHeart", 1036, "Performs a finger heart gesture while on its hind legs."),
-    (
-        "Handstand",
-        1301,
-        "Commands the robot to perform a handstand, demonstrating balance and control.",
-    ),
-    (
-        "CrossStep",
-        1302,
-        "Engages the robot in a cross-stepping routine, useful for complex locomotion or dance moves.",
-    ),
-    (
-        "OnesidedStep",
-        1303,
-        "Commands the robot to perform a stepping motion that predominantly uses one side.",
-    ),
-    (
-        "Bound",
-        1304,
-        "Initiates a bounding motion, similar to a light, repetitive hopping or leaping.",
-    ),
-    (
-        "LeadFollow",
-        1045,
-        "Engages follow-the-leader behavior, where the robot follows a designated leader or follows a signal.",
-    ),
-    ("LeftFlip", 1042, "Executes a flip towards the left side."),
-    ("RightFlip", 1043, "Performs a flip towards the right side."),
-    ("Backflip", 1044, "Executes a backflip, a complex and dynamic maneuver."),
+    # ("WiggleHips", 1033, "Causes the robot to wiggle its hips."),
+    # (
+    #     "GetState",
+    #     1034,
+    #     "Retrieves the current operational state of the robot, including status reports or diagnostic information.",
+    # ),
+    # (
+    #     "EconomicGait",
+    #     1035,
+    #     "Engages a more energy-efficient walking or running mode to conserve battery life.",
+    # ),
+    # ("FingerHeart", 1036, "Performs a finger heart gesture while on its hind legs."),
+    # (
+    #     "Handstand",
+    #     1301,
+    #     "Commands the robot to perform a handstand, demonstrating balance and control.",
+    # ),
+    # (
+    #     "CrossStep",
+    #     1302,
+    #     "Engages the robot in a cross-stepping routine, useful for complex locomotion or dance moves.",
+    # ),
+    # (
+    #     "OnesidedStep",
+    #     1303,
+    #     "Commands the robot to perform a stepping motion that predominantly uses one side.",
+    # ),
+    # (
+    #     "Bound",
+    #     1304,
+    #     "Initiates a bounding motion, similar to a light, repetitive hopping or leaping.",
+    # ),
+    # (
+    #     "LeadFollow",
+    #     1045,
+    #     "Engages follow-the-leader behavior, where the robot follows a designated leader or follows a signal.",
+    # ),
+    # ("LeftFlip", 1042, "Executes a flip towards the left side."),
+    # ("RightFlip", 1043, "Performs a flip towards the right side."),
+    # ("Backflip", 1044, "Executes a backflip, a complex and dynamic maneuver."),
 ]
 
 # region MyUnitreeSkills

From e442e0ce48e126d016b2762bcafc3ea96a4dea2e Mon Sep 17 00:00:00 2001
From: stash <pomichterstash@gmail.com>
Date: Thu, 10 Jul 2025 03:59:42 -0700
Subject: [PATCH 03/18] Added error catching for failed calls

---
 dimos/agents/claude_agent.py | 65 +++++++++++++++++++++++++++---------
 dimos/skills/skills.py       | 47 +++++++++++++++-----------
 tests/run.py                 | 52 ++++++++++++++---------------
 3 files changed, 102 insertions(+), 62 deletions(-)

diff --git a/dimos/agents/claude_agent.py b/dimos/agents/claude_agent.py
index 53148d17b3..e87b1f47b4 100644
--- a/dimos/agents/claude_agent.py
+++ b/dimos/agents/claude_agent.py
@@ -624,8 +624,11 @@ def _observable_query(
             observer.on_completed()
         except Exception as e:
             logger.error(f"Query failed in {self.dev_name}: {e}")
-            observer.on_error(e)
-            self.response_subject.on_error(e)
+            # Send a user-friendly error message instead of propagating the error
+            error_message = "I apologize, but I'm having trouble processing your request right now. Please try again."
+            observer.on_next(error_message)
+            self.response_subject.on_next(error_message)
+            observer.on_completed()
 
     def _handle_tooling(self, response_message, messages):
         """Executes tools and appends tool-use/result blocks to messages."""
@@ -649,12 +652,36 @@ def _handle_tooling(self, response_message, messages):
             }
             messages.append({"role": "assistant", "content": [tool_use_block]})
 
-            # Execute the tool
-            args = json.loads(tool_call.function.arguments)
-            tool_result = self.skills.call(tool_call.function.name, **args)
-
-            # Add tool result to conversation history
-            if tool_result:
+            try:
+                # Execute the tool
+                args = json.loads(tool_call.function.arguments)
+                tool_result = self.skills.call(tool_call.function.name, **args)
+
+                # Check if the result is an error message
+                if isinstance(tool_result, str) and (
+                    "Error executing skill" in tool_result or "is not available" in tool_result
+                ):
+                    # Log the error but provide a user-friendly message
+                    logger.error(f"Tool execution failed: {tool_result}")
+                    tool_result = "I apologize, but I'm having trouble executing that action right now. Please try again or ask for something else."
+
+                # Add tool result to conversation history
+                if tool_result:
+                    messages.append(
+                        {
+                            "role": "user",
+                            "content": [
+                                {
+                                    "type": "tool_result",
+                                    "tool_use_id": tool_call.id,
+                                    "content": f"{tool_result}",
+                                }
+                            ],
+                        }
+                    )
+            except Exception as e:
+                logger.error(f"Unexpected error executing tool {tool_call.function.name}: {e}")
+                # Add error result to conversation history
                 messages.append(
                     {
                         "role": "user",
@@ -662,7 +689,7 @@ def _handle_tooling(self, response_message, messages):
                             {
                                 "type": "tool_result",
                                 "tool_use_id": tool_call.id,
-                                "content": f"{tool_result}",
+                                "content": "I apologize, but I encountered an error while trying to execute that action. Please try again.",
                             }
                         ],
                     }
@@ -672,13 +699,19 @@ def _tooling_callback(self, response_message):
         """Runs the observable query for each tool call in the current response_message"""
         if not hasattr(response_message, "tool_calls") or not response_message.tool_calls:
             return
-        for tool_call in response_message.tool_calls:
-            tool_name = tool_call.function.name
-            tool_id = tool_call.id
-            self.run_observable_query(
-                query_text=f"Tool {tool_name}, ID: {tool_id} execution complete. Please summarize the results and continue.",
-                thinking_budget_tokens=0,
-            ).run()
+
+        try:
+            for tool_call in response_message.tool_calls:
+                tool_name = tool_call.function.name
+                tool_id = tool_call.id
+                self.run_observable_query(
+                    query_text=f"Tool {tool_name}, ID: {tool_id} execution complete. Please summarize the results and continue.",
+                    thinking_budget_tokens=0,
+                ).run()
+        except Exception as e:
+            logger.error(f"Error in tooling callback: {e}")
+            # Continue processing even if the callback fails
+            pass
 
     def _debug_api_call(self, claude_params: dict):
         """Debugging function to log API calls with truncated base64 data."""
diff --git a/dimos/skills/skills.py b/dimos/skills/skills.py
index 559f9b0e36..f6c7456d24 100644
--- a/dimos/skills/skills.py
+++ b/dimos/skills/skills.py
@@ -121,28 +121,35 @@ def create_instance(self, name, **kwargs):
             print(f"Stored args for later instance creation: {name} with args: {kwargs}")
 
     def call(self, name, **args):
-        # Get the stored args if available; otherwise, use an empty dict
-        stored_args = self._instances.get(name, {})
-
-        # Merge the arguments with priority given to stored arguments
-        complete_args = {**args, **stored_args}
-
-        # Dynamically get the class from the module or current script
-        skill_class = getattr(self, name, None)
-        if skill_class is None:
-            for skill in self.get():
-                if name == skill.__name__:
-                    skill_class = skill
-                    break
-            if skill_class is None:
-                raise ValueError(f"Skill class not found: {name}")
+        try:
+            # Get the stored args if available; otherwise, use an empty dict
+            stored_args = self._instances.get(name, {})
 
-        # Initialize the instance with the merged arguments
-        instance = skill_class(**complete_args)
-        print(f"Instance created and function called for: {name} with args: {complete_args}")
+            # Merge the arguments with priority given to stored arguments
+            complete_args = {**args, **stored_args}
 
-        # Call the instance directly
-        return instance()
+            # Dynamically get the class from the module or current script
+            skill_class = getattr(self, name, None)
+            if skill_class is None:
+                for skill in self.get():
+                    if name == skill.__name__:
+                        skill_class = skill
+                        break
+                if skill_class is None:
+                    error_msg = f"Skill '{name}' is not available. Please check if it's properly registered."
+                    logger.error(f"Skill class not found: {name}")
+                    return error_msg
+
+            # Initialize the instance with the merged arguments
+            instance = skill_class(**complete_args)
+            print(f"Instance created and function called for: {name} with args: {complete_args}")
+
+            # Call the instance directly
+            return instance()
+        except Exception as e:
+            error_msg = f"Error executing skill '{name}': {str(e)}"
+            logger.error(error_msg)
+            return error_msg
 
     # ==== Tools ====
 
diff --git a/tests/run.py b/tests/run.py
index 6f7fbf9d0c..23c6738c48 100644
--- a/tests/run.py
+++ b/tests/run.py
@@ -216,32 +216,30 @@ def newmap(msg):
 min_confidence = 0.6
 class_filter = None  # No class filtering
 min_confidence = 0.99  # temporarily disable detections
-# detector = Detic2DDetector(vocabulary=None, threshold=min_confidence)
 
 # Create video stream from robot's camera
-video_stream = robot.get_video_stream()  # WebRTC doesn't use ROS video stream
+video_stream = backpressure(robot.get_video_stream())  # WebRTC doesn't use ROS video stream
 
 # # Initialize ObjectDetectionStream with robot
-# object_detector = ObjectDetectionStream(
-#     camera_intrinsics=robot.camera_intrinsics,
-#     min_confidence=min_confidence,
-#     class_filter=class_filter,
-#     get_pose=robot.get_pose,
-#     detector=detector,
-#     video_stream=video_stream,
-# )
+object_detector = ObjectDetectionStream(
+    camera_intrinsics=robot.camera_intrinsics,
+    min_confidence=min_confidence,
+    class_filter=class_filter,
+    get_pose=robot.get_pose,
+    video_stream=video_stream,
+)
 
 # # Create visualization stream for web interface
-# viz_stream = backpressure(object_detector.get_stream()).pipe(
-#     ops.share(),
-#     ops.map(lambda x: x["viz_frame"] if x is not None else None),
-#     ops.filter(lambda x: x is not None),
-# )
+viz_stream = backpressure(object_detector.get_stream()).pipe(
+    ops.share(),
+    ops.map(lambda x: x["viz_frame"] if x is not None else None),
+    ops.filter(lambda x: x is not None),
+)
 
 # # Get the formatted detection stream
-# formatted_detection_stream = object_detector.get_formatted_stream().pipe(
-#     ops.filter(lambda x: x is not None)
-# )
+formatted_detection_stream = object_detector.get_formatted_stream().pipe(
+    ops.filter(lambda x: x is not None)
+)
 
 
 # Create a direct mapping that combines detection data with locations
@@ -272,12 +270,12 @@ def combine_with_locations(object_detections):
 
 
 # Create the combined stream with a simple pipe operation
-# enhanced_data_stream = formatted_detection_stream.pipe(ops.map(combine_with_locations), ops.share())
+enhanced_data_stream = formatted_detection_stream.pipe(ops.map(combine_with_locations), ops.share())
 
 streams = {
     "unitree_video": robot.get_video_stream(),  # Changed from get_ros_video_stream to get_video_stream for WebRTC
     "local_planner_viz": local_planner_viz_stream,
-    # "object_detection": viz_stream,  # Uncommented object detection
+    "object_detection": viz_stream,  # Uncommented object detection
 }
 text_streams = {
     "agent_responses": agent_response_stream,
@@ -294,15 +292,17 @@ def combine_with_locations(object_detections):
     system_query = f.read()
 
 # Create a ClaudeAgent instance
-agent = CerebrasAgent(
+agent = ClaudeAgent(
     dev_name="test_agent",
     # input_query_stream=stt_node.emit_text(),
     input_query_stream=web_interface.query_stream,
+    input_data_stream=enhanced_data_stream,
     skills=robot.get_skills(),
     system_query=system_query,
-    # model_name="claude-3-7-sonnet-latest",
-    # thinking_budget_tokens=0,
-    model_name="llama-4-scout-17b-16e-instruct",
+    model_name="claude-3-5-haiku-latest",
+    thinking_budget_tokens=0,
+    max_output_tokens_per_request=8192,
+    # model_name="llama-4-scout-17b-16e-instruct",
 )
 
 # tts_node = tts()
@@ -313,7 +313,7 @@ def combine_with_locations(object_detections):
 robot_skills.add(Observe)
 robot_skills.add(KillSkill)
 robot_skills.add(NavigateWithText)
-robot_skills.add(FollowHuman)
+# robot_skills.add(FollowHuman) # TODO: broken
 robot_skills.add(GetPose)
 # robot_skills.add(Speak)
 robot_skills.add(NavigateToGoal)
@@ -323,7 +323,7 @@ def combine_with_locations(object_detections):
 robot_skills.create_instance("Observe", robot=robot, agent=agent)
 robot_skills.create_instance("KillSkill", robot=robot, skill_library=robot_skills)
 robot_skills.create_instance("NavigateWithText", robot=robot)
-robot_skills.create_instance("FollowHuman", robot=robot)
+# robot_skills.create_instance("FollowHuman", robot=robot)
 robot_skills.create_instance("GetPose", robot=robot)
 robot_skills.create_instance("NavigateToGoal", robot=robot)
 robot_skills.create_instance("Explore", robot=robot)

From 4fcd9b2c2f6928acc1f176ceecc22179b4ec9e29 Mon Sep 17 00:00:00 2001
From: stash <pomichterstash@gmail.com>
Date: Thu, 10 Jul 2025 04:28:09 -0700
Subject: [PATCH 04/18] Fixed broken due to transform_robot_to_map() breaking
 update

---
 dimos/perception/object_detection_stream.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dimos/perception/object_detection_stream.py b/dimos/perception/object_detection_stream.py
index 552a31acfe..3284d99f8b 100644
--- a/dimos/perception/object_detection_stream.py
+++ b/dimos/perception/object_detection_stream.py
@@ -205,7 +205,7 @@ def process_frame(frame):
                             # position and rotation are already Vector objects, no need to convert
                             robot_pose = self.get_pose()
                             position, rotation = transform_robot_to_map(
-                                robot_pose, position, rotation
+                                robot_pose["position"], robot_pose["rotation"], position, rotation
                             )
                     except Exception as e:
                         logger.error(f"Error transforming to map frame: {e}")

From 4769796eec5f93e7b838135219ab63134ef3a350 Mon Sep 17 00:00:00 2001
From: stash <pomichterstash@gmail.com>
Date: Thu, 10 Jul 2025 04:28:56 -0700
Subject: [PATCH 05/18] Fully working obj det stream

---
 tests/run.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/tests/run.py b/tests/run.py
index 23c6738c48..e21264a069 100644
--- a/tests/run.py
+++ b/tests/run.py
@@ -215,7 +215,6 @@ def newmap(msg):
 # Initialize object detection stream
 min_confidence = 0.6
 class_filter = None  # No class filtering
-min_confidence = 0.99  # temporarily disable detections
 
 # Create video stream from robot's camera
 video_stream = backpressure(robot.get_video_stream())  # WebRTC doesn't use ROS video stream
@@ -223,10 +222,10 @@ def newmap(msg):
 # # Initialize ObjectDetectionStream with robot
 object_detector = ObjectDetectionStream(
     camera_intrinsics=robot.camera_intrinsics,
-    min_confidence=min_confidence,
     class_filter=class_filter,
     get_pose=robot.get_pose,
     video_stream=video_stream,
+    draw_masks=True,
 )
 
 # # Create visualization stream for web interface

From 064710c51e9c4e5dadb2d2fbf47b60767b7752e2 Mon Sep 17 00:00:00 2001
From: stash <pomichterstash@gmail.com>
Date: Thu, 10 Jul 2025 04:38:09 -0700
Subject: [PATCH 06/18] Voice interface integrated and working on localhost

---
 dimos/web/voice_web_interface.py | 228 +++++++++++++++++++++++++++++++
 tests/run.py                     |  16 ++-
 2 files changed, 241 insertions(+), 3 deletions(-)
 create mode 100644 dimos/web/voice_web_interface.py

diff --git a/dimos/web/voice_web_interface.py b/dimos/web/voice_web_interface.py
new file mode 100644
index 0000000000..c5ab8c0d04
--- /dev/null
+++ b/dimos/web/voice_web_interface.py
@@ -0,0 +1,228 @@
+# Copyright 2025 Dimensional Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import annotations
+
+import io
+import time
+import logging
+from pathlib import Path
+from typing import Optional
+
+import numpy as np
+import ffmpeg
+import soundfile as sf
+import reactivex as rx
+from fastapi import FastAPI, UploadFile, File
+from fastapi.responses import HTMLResponse, JSONResponse
+from fastapi.middleware.cors import CORSMiddleware
+from reactivex.subject import Subject
+from reactivex import operators as ops
+
+from dimos.web.edge_io import EdgeIO
+from dimos.stream.audio.base import AudioEvent
+
+logger = logging.getLogger(__name__)
+logging.basicConfig(level=logging.INFO)
+
+
+class VoiceWebInterface(EdgeIO):
+    """A minimal FastAPI server that captures audio from the browser and exposes it
+    as a ReactiveX ``Observable`` of :class:`dimos.stream.audio.base.AudioEvent`.
+
+    The browser side records audio using the MediaRecorder API and sends it as a
+    single *webm* blob to the ``/upload_audio`` endpoint when the user stops
+    recording.  The server converts the blob to mono 16-kHz *wav* using
+    *ffmpeg*, loads it into a NumPy array with *soundfile*, wraps it in an
+    :class:`AudioEvent`, and finally pushes it to an internal
+    ``Subject``.  Down-stream components such as the Whisper STT node can simply
+    subscribe to :pyattr:`audio_stream` or call :pyfunc:`emit_audio` to receive
+    the audio events.
+    """
+
+    def __init__(self, host: str = "0.0.0.0", port: int = 5560):
+        super().__init__(dev_name="Voice Web Interface", edge_type="Input")
+
+        self.host = host
+        self.port = port
+
+        # Reactive stream for audio events
+        self._audio_subject: Subject = Subject()
+        # Shared observable so multiple subscribers receive the same events
+        self.audio_stream = self._audio_subject.pipe(ops.share())
+
+        # FastAPI app & CORS for local development
+        self.app = FastAPI()
+        self.app.add_middleware(
+            CORSMiddleware,
+            allow_origins=["*"],
+            allow_credentials=True,
+            allow_methods=["*"],
+            allow_headers=["*"],
+        )
+
+        self._setup_routes()
+
+    # ------------------------------------------------------------------
+    # Public helpers
+    # ------------------------------------------------------------------
+
+    def emit_audio(self):
+        """Return the shared audio observable."""
+        return self.audio_stream
+
+    # ------------------------------------------------------------------
+    # FastAPI routes
+    # ------------------------------------------------------------------
+
+    def _setup_routes(self):
+        @self.app.get("/", response_class=HTMLResponse)
+        async def index():  # noqa: D401 – simple page
+            """Return minimal HTML that records and uploads audio."""
+            return HTMLResponse(content=self._index_html(), status_code=200)
+
+        @self.app.post("/upload_audio")
+        async def upload_audio(file: UploadFile = File(...)):
+            try:
+                data = await file.read()
+                audio_np, sr = self._decode_audio(data)
+                if audio_np is None:
+                    return JSONResponse(
+                        status_code=400,
+                        content={"success": False, "message": "Unable to decode audio"},
+                    )
+
+                event = AudioEvent(
+                    data=audio_np,
+                    sample_rate=sr,
+                    timestamp=time.time(),
+                    channels=1 if audio_np.ndim == 1 else audio_np.shape[1],
+                )
+
+                # Push to reactive stream
+                self._audio_subject.on_next(event)
+                logger.info("Received audio – %.2f s, %d Hz", event.data.shape[0] / sr, sr)
+                return {"success": True}
+            except Exception as e:  # pragma: no cover – runtime safety
+                logger.exception("Failed to process uploaded audio: %s", e)
+                return JSONResponse(status_code=500, content={"success": False, "message": str(e)})
+
+    # ------------------------------------------------------------------
+    # Internal helpers
+    # ------------------------------------------------------------------
+
+    @staticmethod
+    def _decode_audio(raw: bytes) -> tuple[Optional[np.ndarray], Optional[int]]:
+        """Convert the *webm/opus* blob sent by the browser into mono 16-kHz PCM.
+
+        Returns (audio, sample_rate) or (None, None) on failure.
+        """
+        try:
+            # Use ffmpeg to convert to 16-kHz mono 16-bit PCM WAV in memory
+            out, _ = (
+                ffmpeg.input("pipe:0")
+                .output(
+                    "pipe:1",
+                    format="wav",
+                    acodec="pcm_s16le",
+                    ac=1,
+                    ar="16000",
+                    loglevel="quiet",
+                )
+                .run(input=raw, capture_stdout=True, capture_stderr=True)
+            )
+            # Load with soundfile (returns float32 by default)
+            audio, sr = sf.read(io.BytesIO(out), dtype="float32")
+            # Ensure 1-D array (mono)
+            if audio.ndim > 1:
+                audio = audio[:, 0]
+            return np.array(audio), sr
+        except Exception as exc:
+            logger.error("ffmpeg decoding failed: %s", exc)
+            return None, None
+
+    @staticmethod
+    def _index_html() -> str:
+        """Return HTML/JS for the voice interface."""
+        return """
+<!DOCTYPE html>
+<html lang=\"en\">
+<head>
+    <meta charset=\"UTF-8\" />
+    <meta name=\"viewport\" content=\"width=device-width, initial-scale=1.0\" />
+    <title>Voice Command Interface</title>
+    <style>
+        body { font-family: Arial, sans-serif; display:flex; flex-direction:column; align-items:center; padding:40px; }
+        button { padding: 15px 25px; font-size: 18px; cursor: pointer; border: none; border-radius: 6px; }
+        .rec { background:#dc3545; color:#fff; }
+        .idle { background:#28a745; color:#fff; }
+        #status { margin-top:20px; font-weight:bold; }
+    </style>
+</head>
+<body>
+    <h1>Voice Command Interface</h1>
+    <button id=\"recordBtn\" class=\"idle\">🎤 Start Recording</button>
+    <div id=\"status\"></div>
+
+<script>
+let mediaRecorder;
+let chunks = [];
+const btn = document.getElementById('recordBtn');
+const statusDiv = document.getElementById('status');
+
+btn.addEventListener('click', async () => {
+    if (mediaRecorder && mediaRecorder.state === 'recording') {
+        mediaRecorder.stop();
+        btn.textContent = '🎤 Start Recording';
+        btn.className = 'idle';
+    } else {
+        if (!mediaRecorder) {
+            const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
+            mediaRecorder = new MediaRecorder(stream);
+            mediaRecorder.ondataavailable = e => chunks.push(e.data);
+            mediaRecorder.onstop = async () => {
+                const blob = new Blob(chunks, { type: 'audio/webm' });
+                chunks = [];
+                statusDiv.textContent = 'Uploading…';
+                const formData = new FormData();
+                formData.append('file', blob, 'recording.webm');
+                try {
+                    const res = await fetch('/upload_audio', { method: 'POST', body: formData });
+                    const json = await res.json();
+                    statusDiv.textContent = json.success ? 'Uploaded!' : ('Error: ' + json.message);
+                } catch(err){
+                    statusDiv.textContent = 'Upload failed';
+                }
+            };
+        }
+        mediaRecorder.start();
+        btn.textContent = '⏹️ Stop Recording';
+        btn.className = 'rec';
+        statusDiv.textContent = 'Recording…';
+    }
+});
+</script>
+</body>
+</html>
+"""
+
+    # ------------------------------------------------------------------
+    # Server runner
+    # ------------------------------------------------------------------
+
+    def run(self):
+        """Run the FastAPI application using Uvicorn."""
+        import uvicorn
+
+        uvicorn.run(self.app, host=self.host, port=self.port)
diff --git a/tests/run.py b/tests/run.py
index e21264a069..72b05ea2df 100644
--- a/tests/run.py
+++ b/tests/run.py
@@ -24,6 +24,7 @@
 # from dimos.robot.unitree.unitree_ros_control import UnitreeROSControl
 from dimos.robot.unitree.unitree_skills import MyUnitreeSkills
 from dimos.web.robot_web_interface import RobotWebInterface
+from dimos.web.voice_web_interface import VoiceWebInterface
 from dimos.web.websocket_vis.server import WebsocketVis
 from dimos.skills.observe_stream import ObserveStream
 from dimos.skills.observe import Observe
@@ -282,7 +283,11 @@ def combine_with_locations(object_detections):
 
 web_interface = RobotWebInterface(port=5555, text_streams=text_streams, **streams)
 
-# stt_node = stt()
+# Initialize voice web interface
+voice_web_interface = VoiceWebInterface(port=5560)
+
+stt_node = stt()
+stt_node.consume_audio(voice_web_interface.emit_audio())
 
 # Read system query from prompt.txt file
 with open(
@@ -293,8 +298,8 @@ def combine_with_locations(object_detections):
 # Create a ClaudeAgent instance
 agent = ClaudeAgent(
     dev_name="test_agent",
-    # input_query_stream=stt_node.emit_text(),
-    input_query_stream=web_interface.query_stream,
+    input_query_stream=stt_node.emit_text(),
+    # input_query_stream=web_interface.query_stream,
     input_data_stream=enhanced_data_stream,
     skills=robot.get_skills(),
     system_query=system_query,
@@ -339,6 +344,11 @@ def combine_with_locations(object_detections):
 web_thread.daemon = True
 web_thread.start()
 
+# Start voice web interface in a separate thread to avoid blocking
+voice_thread = threading.Thread(target=voice_web_interface.run)
+voice_thread.daemon = True
+voice_thread.start()
+
 try:
     while True:
         # Main loop - can add robot movement or other logic here

From 512b1a9dc1a57f9070f1f486123031f6d9db90a9 Mon Sep 17 00:00:00 2001
From: stash <pomichterstash@gmail.com>
Date: Thu, 10 Jul 2025 05:07:35 -0700
Subject: [PATCH 07/18] Working voice interface integrated to localhost:5555
 backend

---
 dimos/web/dimos_interface/api/server.py       |  73 ++++++++-
 .../api/templates/index_fastapi.html          | 148 ++++++++++++++++++
 dimos/web/robot_web_interface.py              |   3 +-
 tests/run.py                                  |  16 +-
 4 files changed, 227 insertions(+), 13 deletions(-)

diff --git a/dimos/web/dimos_interface/api/server.py b/dimos/web/dimos_interface/api/server.py
index 812973e300..bcc590ab46 100644
--- a/dimos/web/dimos_interface/api/server.py
+++ b/dimos/web/dimos_interface/api/server.py
@@ -27,7 +27,7 @@
 # Fast Api & Uvicorn
 import cv2
 from dimos.web.edge_io import EdgeIO
-from fastapi import FastAPI, Request, Form, HTTPException
+from fastapi import FastAPI, Request, Form, HTTPException, UploadFile, File
 from fastapi.responses import HTMLResponse, StreamingResponse, JSONResponse
 from sse_starlette.sse import EventSourceResponse
 from fastapi.templating import Jinja2Templates
@@ -42,6 +42,14 @@
 import reactivex as rx
 from fastapi.middleware.cors import CORSMiddleware
 
+# For audio processing
+import io
+import time
+import numpy as np
+import ffmpeg
+import soundfile as sf
+from dimos.stream.audio.base import AudioEvent
+
 # TODO: Resolve threading, start/stop stream functionality.
 
 
@@ -53,6 +61,7 @@ def __init__(
         host="0.0.0.0",
         port=5555,
         text_streams=None,
+        audio_subject=None,
         **streams,
     ):
         print("Starting FastAPIServer initialization...")  # Debug print
@@ -88,6 +97,7 @@ def __init__(
         # Create a Subject for text queries
         self.query_subject = rx.subject.Subject()
         self.query_stream = self.query_subject.pipe(ops.share())
+        self.audio_subject = audio_subject
 
         for key in self.streams:
             if self.streams[key] is not None:
@@ -198,6 +208,33 @@ async def text_stream_generator(self, key):
         finally:
             self.text_clients.remove(client_id)
 
+    @staticmethod
+    def _decode_audio(raw: bytes) -> tuple[np.ndarray, int]:
+        """Convert the webm/opus blob sent by the browser into mono 16-kHz PCM."""
+        try:
+            # Use ffmpeg to convert to 16-kHz mono 16-bit PCM WAV in memory
+            out, _ = (
+                ffmpeg.input("pipe:0")
+                .output(
+                    "pipe:1",
+                    format="wav",
+                    acodec="pcm_s16le",
+                    ac=1,
+                    ar="16000",
+                    loglevel="quiet",
+                )
+                .run(input=raw, capture_stdout=True, capture_stderr=True)
+            )
+            # Load with soundfile (returns float32 by default)
+            audio, sr = sf.read(io.BytesIO(out), dtype="float32")
+            # Ensure 1-D array (mono)
+            if audio.ndim > 1:
+                audio = audio[:, 0]
+            return np.array(audio), sr
+        except Exception as exc:
+            print(f"ffmpeg decoding failed: {exc}")
+            return None, None
+
     def setup_routes(self):
         """Set up FastAPI routes."""
 
@@ -221,6 +258,7 @@ async def index(request: Request):
                     "request": request,
                     "stream_keys": stream_keys,
                     "text_stream_keys": text_stream_keys,
+                    "has_voice": self.audio_subject is not None,
                 },
             )
 
@@ -240,6 +278,39 @@ async def submit_query(query: str = Form(...)):
                     content={"success": False, "message": f"Server error: {str(e)}"},
                 )
 
+        @self.app.post("/upload_audio")
+        async def upload_audio(file: UploadFile = File(...)):
+            """Handle audio upload from the browser."""
+            if self.audio_subject is None:
+                return JSONResponse(
+                    status_code=400,
+                    content={"success": False, "message": "Voice input not configured"},
+                )
+
+            try:
+                data = await file.read()
+                audio_np, sr = self._decode_audio(data)
+                if audio_np is None:
+                    return JSONResponse(
+                        status_code=400,
+                        content={"success": False, "message": "Unable to decode audio"},
+                    )
+
+                event = AudioEvent(
+                    data=audio_np,
+                    sample_rate=sr,
+                    timestamp=time.time(),
+                    channels=1 if audio_np.ndim == 1 else audio_np.shape[1],
+                )
+
+                # Push to reactive stream
+                self.audio_subject.on_next(event)
+                print(f"Received audio – {event.data.shape[0] / sr:.2f} s, {sr} Hz")
+                return {"success": True}
+            except Exception as e:
+                print(f"Failed to process uploaded audio: {e}")
+                return JSONResponse(status_code=500, content={"success": False, "message": str(e)})
+
         # Unitree API endpoints
         @self.app.get("/unitree/status")
         async def unitree_status():
diff --git a/dimos/web/dimos_interface/api/templates/index_fastapi.html b/dimos/web/dimos_interface/api/templates/index_fastapi.html
index 94cf0be328..406557c04a 100644
--- a/dimos/web/dimos_interface/api/templates/index_fastapi.html
+++ b/dimos/web/dimos_interface/api/templates/index_fastapi.html
@@ -130,6 +130,7 @@
         .query-form {
             display: flex;
             gap: 10px;
+            align-items: center;
         }
         
         .query-input {
@@ -155,6 +156,81 @@
             background-color: #218838;
         }
         
+        /* Voice button styles */
+        .voice-button {
+            width: 50px;
+            height: 50px;
+            border-radius: 50%;
+            background-color: #dc3545;
+            color: white;
+            border: none;
+            cursor: pointer;
+            display: flex;
+            align-items: center;
+            justify-content: center;
+            font-size: 24px;
+            transition: all 0.3s ease;
+            box-shadow: 0 2px 5px rgba(0,0,0,0.2);
+            position: relative;
+        }
+        
+        .voice-button:hover {
+            transform: scale(1.1);
+            box-shadow: 0 4px 8px rgba(0,0,0,0.3);
+        }
+        
+        .voice-button.recording {
+            background-color: #ff0000;
+            animation: pulse 1.5s infinite;
+        }
+        
+        .voice-button.recording::after {
+            content: '';
+            position: absolute;
+            top: -10px;
+            left: -10px;
+            right: -10px;
+            bottom: -10px;
+            border: 3px solid rgba(255, 0, 0, 0.5);
+            border-radius: 50%;
+            animation: ripple 1.5s infinite;
+        }
+        
+        @keyframes pulse {
+            0% { transform: scale(1); }
+            50% { transform: scale(1.05); }
+            100% { transform: scale(1); }
+        }
+        
+        @keyframes ripple {
+            0% {
+                transform: scale(1);
+                opacity: 1;
+            }
+            100% {
+                transform: scale(1.2);
+                opacity: 0;
+            }
+        }
+        
+        .voice-status {
+            position: absolute;
+            top: -25px;
+            left: 50%;
+            transform: translateX(-50%);
+            background-color: rgba(0,0,0,0.8);
+            color: white;
+            padding: 4px 8px;
+            border-radius: 4px;
+            font-size: 12px;
+            white-space: nowrap;
+            display: none;
+        }
+        
+        .voice-button.recording .voice-status {
+            display: block;
+        }
+        
         .query-response {
             margin-top: 15px;
             padding: 10px;
@@ -218,6 +294,12 @@ <h2>Ask a Question</h2>
         <form id="queryForm" class="query-form">
             <input type="text" id="queryInput" class="query-input" placeholder="Type your query here..." required>
             <button type="submit" class="query-button">Send</button>
+            {% if has_voice %}
+            <button type="button" id="voiceButton" class="voice-button">
+                <span class="voice-status">Recording...</span>
+                🎤
+            </button>
+            {% endif %}
         </form>
         <div id="queryResponse" class="query-response"></div>
     </div>
@@ -277,6 +359,72 @@ <h2>{{ key.replace('_', ' ').title() }}</h2>
             window.location.reload(true);
         }
 
+        // Voice recording functionality
+        {% if has_voice %}
+        let mediaRecorder;
+        let chunks = [];
+        const voiceBtn = document.getElementById('voiceButton');
+        const queryResponse = document.getElementById('queryResponse');
+
+        voiceBtn.addEventListener('click', async () => {
+            if (mediaRecorder && mediaRecorder.state === 'recording') {
+                // Stop recording
+                mediaRecorder.stop();
+                voiceBtn.classList.remove('recording');
+            } else {
+                // Start recording
+                try {
+                    if (!mediaRecorder) {
+                        const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
+                        mediaRecorder = new MediaRecorder(stream);
+                        
+                        mediaRecorder.ondataavailable = e => chunks.push(e.data);
+                        
+                        mediaRecorder.onstop = async () => {
+                            const blob = new Blob(chunks, { type: 'audio/webm' });
+                            chunks = [];
+                            
+                            // Show uploading status
+                            queryResponse.textContent = 'Processing voice command...';
+                            queryResponse.className = 'query-response';
+                            queryResponse.style.display = 'block';
+                            
+                            const formData = new FormData();
+                            formData.append('file', blob, 'recording.webm');
+                            
+                            try {
+                                const res = await fetch('/upload_audio', { 
+                                    method: 'POST', 
+                                    body: formData 
+                                });
+                                const json = await res.json();
+                                
+                                if (json.success) {
+                                    queryResponse.textContent = 'Voice command received!';
+                                    queryResponse.className = 'query-response success';
+                                    setTimeout(() => {
+                                        queryResponse.style.display = 'none';
+                                    }, 3000);
+                                } else {
+                                    queryResponse.textContent = 'Error: ' + json.message;
+                                    queryResponse.className = 'query-response error';
+                                }
+                            } catch (err) {
+                                queryResponse.textContent = 'Upload failed: ' + err.message;
+                                queryResponse.className = 'query-response error';
+                            }
+                        };
+                    }
+                    
+                    mediaRecorder.start();
+                    voiceBtn.classList.add('recording');
+                } catch (err) {
+                    alert('Microphone access denied. Please allow microphone access to use voice commands.');
+                }
+            }
+        });
+        {% endif %}
+
         // Handle query form submission
         document.getElementById('queryForm').addEventListener('submit', async function(e) {
             e.preventDefault();
diff --git a/dimos/web/robot_web_interface.py b/dimos/web/robot_web_interface.py
index 72dcce9d29..33847c0056 100644
--- a/dimos/web/robot_web_interface.py
+++ b/dimos/web/robot_web_interface.py
@@ -23,12 +23,13 @@
 class RobotWebInterface(FastAPIServer):
     """Wrapper class for the dimos-interface FastAPI server."""
 
-    def __init__(self, port=5555, text_streams=None, **streams):
+    def __init__(self, port=5555, text_streams=None, audio_subject=None, **streams):
         super().__init__(
             dev_name="Robot Web Interface",
             edge_type="Bidirectional",
             host="0.0.0.0",
             port=port,
             text_streams=text_streams,
+            audio_subject=audio_subject,
             **streams,
         )
diff --git a/tests/run.py b/tests/run.py
index 72b05ea2df..977742ba34 100644
--- a/tests/run.py
+++ b/tests/run.py
@@ -24,7 +24,6 @@
 # from dimos.robot.unitree.unitree_ros_control import UnitreeROSControl
 from dimos.robot.unitree.unitree_skills import MyUnitreeSkills
 from dimos.web.robot_web_interface import RobotWebInterface
-from dimos.web.voice_web_interface import VoiceWebInterface
 from dimos.web.websocket_vis.server import WebsocketVis
 from dimos.skills.observe_stream import ObserveStream
 from dimos.skills.observe import Observe
@@ -212,6 +211,7 @@ def newmap(msg):
 agent_response_subject = rx.subject.Subject()
 agent_response_stream = agent_response_subject.pipe(ops.share())
 local_planner_viz_stream = robot.local_planner_viz_stream.pipe(ops.share())
+audio_subject = rx.subject.Subject()
 
 # Initialize object detection stream
 min_confidence = 0.6
@@ -281,13 +281,12 @@ def combine_with_locations(object_detections):
     "agent_responses": agent_response_stream,
 }
 
-web_interface = RobotWebInterface(port=5555, text_streams=text_streams, **streams)
-
-# Initialize voice web interface
-voice_web_interface = VoiceWebInterface(port=5560)
+web_interface = RobotWebInterface(
+    port=5555, text_streams=text_streams, audio_subject=audio_subject, **streams
+)
 
 stt_node = stt()
-stt_node.consume_audio(voice_web_interface.emit_audio())
+stt_node.consume_audio(audio_subject.pipe(ops.share()))
 
 # Read system query from prompt.txt file
 with open(
@@ -344,11 +343,6 @@ def combine_with_locations(object_detections):
 web_thread.daemon = True
 web_thread.start()
 
-# Start voice web interface in a separate thread to avoid blocking
-voice_thread = threading.Thread(target=voice_web_interface.run)
-voice_thread.daemon = True
-voice_thread.start()
-
 try:
     while True:
         # Main loop - can add robot movement or other logic here

From cb64e92db05d32c474768dac9a8012a28d7171b7 Mon Sep 17 00:00:00 2001
From: stash <pomichterstash@gmail.com>
Date: Thu, 10 Jul 2025 05:59:06 -0700
Subject: [PATCH 08/18] Voice streaming added to dimOS terminal interface fully
 tested

---
 dimos/web/dimos_interface/src/App.svelte      |  22 ++
 .../src/components/Input.svelte               |  60 +++--
 .../src/components/VoiceButton.svelte         | 255 ++++++++++++++++++
 3 files changed, 309 insertions(+), 28 deletions(-)
 create mode 100644 dimos/web/dimos_interface/src/components/VoiceButton.svelte

diff --git a/dimos/web/dimos_interface/src/App.svelte b/dimos/web/dimos_interface/src/App.svelte
index a1ddae4931..c249f3e3ea 100644
--- a/dimos/web/dimos_interface/src/App.svelte
+++ b/dimos/web/dimos_interface/src/App.svelte
@@ -3,7 +3,27 @@
   import Input from './components/Input.svelte';
   import History from './components/History.svelte';
   import StreamViewer from './components/StreamViewer.svelte';
+  import VoiceButton from './components/VoiceButton.svelte';
   import { theme } from './stores/theme';
+  import { history } from './stores/history';
+
+  const handleVoiceCommand = async (event: CustomEvent) => {
+    if (event.detail.success) {
+      // Show voice processing message
+      history.update(h => [...h, { 
+        command: '[voice command]', 
+        outputs: ['Processing voice command...'] 
+      }]);
+      
+      // The actual command will be processed by the agent through the audio pipeline
+      // and will appear in the text stream
+    } else {
+      history.update(h => [...h, { 
+        command: '[voice command]', 
+        outputs: [`Error: ${event.detail.error}`] 
+      }]);
+    }
+  };
 </script>
 
 <svelte:head>
@@ -29,3 +49,5 @@
     <Input />
   </div>
 </main>
+
+<VoiceButton on:voiceCommand={handleVoiceCommand} />
diff --git a/dimos/web/dimos_interface/src/components/Input.svelte b/dimos/web/dimos_interface/src/components/Input.svelte
index 86eba9184d..d7cee270f1 100644
--- a/dimos/web/dimos_interface/src/components/Input.svelte
+++ b/dimos/web/dimos_interface/src/components/Input.svelte
@@ -29,34 +29,7 @@
 
   const handleKeyDown = async (event: KeyboardEvent) => {
     if (event.key === 'Enter') {
-      const [commandName, ...args] = command.split(' ');
-
-      if (import.meta.env.VITE_TRACKING_ENABLED === 'true') {
-        track(commandName, ...args);
-      }
-
-      const commandFunction = commands[commandName];
-
-      if (commandFunction) {
-        const output = await commandFunction(args);
-
-        if (commandName !== 'clear') {
-          if (output && typeof output === 'object' && 'type' in output && output.type === 'STREAM_START') {
-            // Add initial message to history
-            $history = [...$history, { command, outputs: [output.initialMessage] }];
-            // Connect to text stream
-            connectTextStream(output.streamKey);
-          } else {
-            $history = [...$history, { command, outputs: [output] }];
-          }
-        }
-      } else {
-        const output = `${commandName}: command not found`;
-        $history = [...$history, { command, outputs: [output] }];
-      }
-
-      command = '';
-      historyIndex = -1;
+      await executeCommand();
     } else if (event.key === 'ArrowUp') {
       if (historyIndex < $history.length - 1) {
         historyIndex++;
@@ -80,6 +53,37 @@
       $history = [];
     }
   };
+
+  const executeCommand = async () => {
+    const [commandName, ...args] = command.split(' ');
+
+    if (import.meta.env.VITE_TRACKING_ENABLED === 'true') {
+      track(commandName, ...args);
+    }
+
+    const commandFunction = commands[commandName];
+
+    if (commandFunction) {
+      const output = await commandFunction(args);
+
+      if (commandName !== 'clear') {
+        if (output && typeof output === 'object' && 'type' in output && output.type === 'STREAM_START') {
+          // Add initial message to history
+          $history = [...$history, { command, outputs: [output.initialMessage] }];
+          // Connect to text stream
+          connectTextStream(output.streamKey);
+        } else {
+          $history = [...$history, { command, outputs: [output] }];
+        }
+      }
+    } else {
+      const output = `${commandName}: command not found`;
+      $history = [...$history, { command, outputs: [output] }];
+    }
+
+    command = '';
+    historyIndex = -1;
+  };
 </script>
 
 <svelte:window
diff --git a/dimos/web/dimos_interface/src/components/VoiceButton.svelte b/dimos/web/dimos_interface/src/components/VoiceButton.svelte
new file mode 100644
index 0000000000..e09cec0672
--- /dev/null
+++ b/dimos/web/dimos_interface/src/components/VoiceButton.svelte
@@ -0,0 +1,255 @@
+<!--
+ Copyright 2025 Dimensional Inc.
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
+<script lang="ts">
+  import { createEventDispatcher } from 'svelte';
+  import { theme } from '../stores/theme';
+  import { connectTextStream } from '../stores/stream';
+
+  const dispatch = createEventDispatcher();
+
+  let isRecording = false;
+  let mediaRecorder: MediaRecorder | null = null;
+  let chunks: Blob[] = [];
+  let isProcessing = false;
+
+  async function toggleRecording() {
+    if (isRecording && mediaRecorder) {
+      // Stop recording
+      mediaRecorder.stop();
+      isRecording = false;
+    } else {
+      // Start recording
+      try {
+        if (!mediaRecorder) {
+          const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
+          mediaRecorder = new MediaRecorder(stream);
+          
+          mediaRecorder.ondataavailable = (e) => chunks.push(e.data);
+          
+          mediaRecorder.onstop = async () => {
+            isProcessing = true;
+            const blob = new Blob(chunks, { type: 'audio/webm' });
+            chunks = [];
+            
+            // Upload to backend
+            const formData = new FormData();
+            formData.append('file', blob, 'recording.webm');
+            
+            try {
+              const res = await fetch('http://0.0.0.0:5555/upload_audio', {
+                method: 'POST',
+                body: formData
+              });
+              
+              const json = await res.json();
+              
+              if (json.success) {
+                // Connect to agent_responses stream to see the output
+                connectTextStream('agent_responses');
+                dispatch('voiceCommand', { success: true });
+              } else {
+                dispatch('voiceCommand', { 
+                  success: false, 
+                  error: json.message 
+                });
+              }
+            } catch (err) {
+              dispatch('voiceCommand', { 
+                success: false, 
+                error: err instanceof Error ? err.message : 'Upload failed' 
+              });
+            } finally {
+              isProcessing = false;
+            }
+          };
+        }
+        
+        mediaRecorder.start();
+        isRecording = true;
+      } catch (err) {
+        dispatch('voiceCommand', { 
+          success: false, 
+          error: 'Microphone access denied' 
+        });
+      }
+    }
+  }
+
+  // Keyboard shortcut support
+  function handleKeyPress(event: KeyboardEvent) {
+    // Ctrl+M or Cmd+M to toggle recording
+    if ((event.ctrlKey || event.metaKey) && event.key === 'm') {
+      event.preventDefault();
+      toggleRecording();
+    }
+  }
+</script>
+
+<svelte:window on:keydown={handleKeyPress} />
+
+<button
+  class="voice-button-fab"
+  class:recording={isRecording}
+  class:processing={isProcessing}
+  on:click={toggleRecording}
+  disabled={isProcessing}
+  style="--theme-color: {$theme.primary}"
+  title={isRecording ? 'Stop recording (Ctrl+M)' : 'Start voice command (Ctrl+M)'}
+>
+  {#if isProcessing}
+    <span class="processing-icon">⟳</span>
+  {:else if isRecording}
+    <span class="mic-icon recording">◉</span>
+  {:else}
+    <span class="mic-icon">🎤</span>
+  {/if}
+</button>
+
+<style>
+  .voice-button-fab {
+    position: fixed;
+    bottom: 30px;
+    right: 30px;
+    width: 120px; /* Increased from 60px to 2x */
+    height: 120px; /* Increased from 60px to 2x */
+    border-radius: 50%;
+    background: #000;
+    border: 2px solid var(--theme-color);
+    color: var(--theme-color);
+    cursor: pointer;
+    font-size: 48px; /* Increased from 24px to 2x */
+    display: flex;
+    align-items: center;
+    justify-content: center;
+    transition: all 0.3s ease;
+    box-shadow: 0 4px 12px rgba(0, 0, 0, 0.5);
+    z-index: 1000;
+  }
+
+  .voice-button-fab:hover:not(:disabled) {
+    background: var(--theme-color);
+    color: #000;
+    box-shadow: 0 6px 20px var(--theme-color);
+    transform: scale(1.1);
+  }
+
+  .voice-button-fab:active:not(:disabled) {
+    transform: scale(0.95);
+  }
+
+  .voice-button-fab:disabled {
+    opacity: 0.5;
+    cursor: not-allowed;
+  }
+
+  .voice-button-fab.recording {
+    animation: pulse 1.5s infinite;
+    border-color: #ff0000;
+    color: #ff0000;
+    background: rgba(255, 0, 0, 0.1);
+  }
+
+  .voice-button-fab.recording:hover {
+    background: #ff0000;
+    color: #000;
+  }
+
+  .voice-button-fab.processing {
+    border-style: dashed;
+  }
+
+  .mic-icon {
+    display: inline-block;
+    transition: transform 0.2s ease;
+    font-size: 56px; /* Increased from 28px to 2x */
+  }
+
+  .mic-icon.recording {
+    color: #ff0000;
+    animation: blink 1s infinite;
+  }
+
+  .processing-icon {
+    display: inline-block;
+    animation: spin 1s linear infinite;
+    font-size: 56px; /* Increased from 28px to 2x */
+  }
+
+  @keyframes pulse {
+    0% { 
+      transform: scale(1);
+      box-shadow: 0 4px 12px rgba(255, 0, 0, 0.4);
+    }
+    50% { 
+      transform: scale(1.05);
+      box-shadow: 0 4px 20px rgba(255, 0, 0, 0.6);
+    }
+    100% { 
+      transform: scale(1);
+      box-shadow: 0 4px 12px rgba(255, 0, 0, 0.4);
+    }
+  }
+
+  @keyframes blink {
+    0%, 100% { opacity: 1; }
+    50% { opacity: 0.5; }
+  }
+
+  @keyframes spin {
+    from { transform: rotate(0deg); }
+    to { transform: rotate(360deg); }
+  }
+
+  /* Terminal-style glow effect */
+  .voice-button-fab.recording::after {
+    content: '';
+    position: absolute;
+    top: -8px;
+    left: -8px;
+    right: -8px;
+    bottom: -8px;
+    border: 2px solid rgba(255, 0, 0, 0.5);
+    border-radius: 50%;
+    animation: ripple 1.5s infinite;
+    pointer-events: none;
+  }
+
+  @keyframes ripple {
+    0% {
+      transform: scale(1);
+      opacity: 0.8;
+    }
+    100% {
+      transform: scale(1.3);
+      opacity: 0;
+    }
+  }
+
+  /* Mobile responsive - slightly smaller on mobile */
+  @media (max-width: 640px) {
+    .voice-button-fab {
+      width: 100px; /* Increased from 50px to 2x */
+      height: 100px; /* Increased from 50px to 2x */
+      bottom: 20px;
+      right: 20px;
+    }
+
+    .mic-icon, .processing-icon {
+      font-size: 44px; /* Increased from 22px to 2x */
+    }
+  }
+</style> 
\ No newline at end of file

From ef9ad98fddeb53a59f1d545f23ab5f94089579eb Mon Sep 17 00:00:00 2001
From: stash <pomichterstash@gmail.com>
Date: Thu, 10 Jul 2025 06:37:29 -0700
Subject: [PATCH 09/18] Expose interface to all network connections

---
 dimos/web/dimos_interface/vite.config.ts | 1 +
 1 file changed, 1 insertion(+)

diff --git a/dimos/web/dimos_interface/vite.config.ts b/dimos/web/dimos_interface/vite.config.ts
index 296050256a..29be79dd4a 100644
--- a/dimos/web/dimos_interface/vite.config.ts
+++ b/dimos/web/dimos_interface/vite.config.ts
@@ -22,6 +22,7 @@ export default defineConfig({
   plugins: [svelte()],
   server: {
     port: 3000,
+    host: '0.0.0.0',
     watch: {
       // Exclude node_modules, .git and other large directories
       ignored: ['**/node_modules/**', '**/.git/**', '**/dist/**', 'lambda/**'],

From e27c85fa79f2e619426d914844dbb8d250cab9c4 Mon Sep 17 00:00:00 2001
From: stash <pomichterstash@gmail.com>
Date: Thu, 10 Jul 2025 07:55:27 -0700
Subject: [PATCH 10/18] UnitreeSpeak skill on the robot working and speed
 optimized over webrtc

---
 dimos/skills/unitree/__init__.py      |   1 +
 dimos/skills/unitree/unitree_speak.py | 296 ++++++++++++++++++++++++++
 tests/run.py                          |   6 +-
 3 files changed, 300 insertions(+), 3 deletions(-)
 create mode 100644 dimos/skills/unitree/__init__.py
 create mode 100644 dimos/skills/unitree/unitree_speak.py

diff --git a/dimos/skills/unitree/__init__.py b/dimos/skills/unitree/__init__.py
new file mode 100644
index 0000000000..8b13789179
--- /dev/null
+++ b/dimos/skills/unitree/__init__.py
@@ -0,0 +1 @@
+
diff --git a/dimos/skills/unitree/unitree_speak.py b/dimos/skills/unitree/unitree_speak.py
new file mode 100644
index 0000000000..e121f3c5c2
--- /dev/null
+++ b/dimos/skills/unitree/unitree_speak.py
@@ -0,0 +1,296 @@
+# Copyright 2025 Dimensional Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from dimos.skills.skills import AbstractRobotSkill
+from pydantic import Field
+import time
+import tempfile
+import os
+import json
+import base64
+import hashlib
+import soundfile as sf
+import numpy as np
+from openai import OpenAI
+from dimos.utils.logging_config import setup_logger
+from go2_webrtc_driver.constants import RTC_TOPIC
+
+logger = setup_logger("dimos.skills.unitree.unitree_speak")
+
+# Audio API constants (from go2_webrtc_driver)
+AUDIO_API = {
+    "GET_AUDIO_LIST": 1001,
+    "SELECT_START_PLAY": 1002,
+    "PAUSE": 1003,
+    "UNSUSPEND": 1004,
+    "SET_PLAY_MODE": 1007,
+    "UPLOAD_AUDIO_FILE": 2001,
+    "ENTER_MEGAPHONE": 4001,
+    "EXIT_MEGAPHONE": 4002,
+    "UPLOAD_MEGAPHONE": 4003,
+}
+
+PLAY_MODES = {
+    "NO_CYCLE": "no_cycle",
+    "SINGLE_CYCLE": "single_cycle",
+    "LIST_LOOP": "list_loop"
+}
+
+
+class UnitreeSpeak(AbstractRobotSkill):
+    """Speak text out loud through the robot's speakers using WebRTC audio upload."""
+
+    text: str = Field(..., description="Text to speak")
+    voice: str = Field(default="echo", description="Voice to use (alloy, echo, fable, onyx, nova, shimmer)")
+    speed: float = Field(default=1.2, description="Speech speed (0.25 to 4.0)")
+    use_megaphone: bool = Field(default=False, description="Use megaphone mode for lower latency (experimental)")
+
+    def __init__(self, **data):
+        super().__init__(**data)
+        self._openai_client = None
+
+    def _get_openai_client(self):
+        if self._openai_client is None:
+            self._openai_client = OpenAI()
+        return self._openai_client
+
+    def _generate_audio(self, text: str) -> bytes:
+        try:
+            client = self._get_openai_client()
+            response = client.audio.speech.create(
+                model="tts-1",
+                voice=self.voice,
+                input=text,
+                speed=self.speed,
+                response_format="mp3"
+            )
+            return response.content
+        except Exception as e:
+            logger.error(f"Error generating audio: {e}")
+            raise
+
+    def _webrtc_request(self, api_id: int, parameter: dict = None):
+        if parameter is None:
+            parameter = {}
+        
+        request_data = {
+            "api_id": api_id,
+            "parameter": json.dumps(parameter) if parameter else "{}"
+        }
+        
+        return self._robot.webrtc_connection.publish_request(
+            RTC_TOPIC["AUDIO_HUB_REQ"],
+            request_data
+        )
+
+    def _upload_audio_to_robot(self, audio_data: bytes, filename: str) -> str:
+        try:
+            file_md5 = hashlib.md5(audio_data).hexdigest()
+            b64_data = base64.b64encode(audio_data).decode('utf-8')
+            
+            chunk_size = 61440
+            chunks = [b64_data[i:i + chunk_size] for i in range(0, len(b64_data), chunk_size)]
+            total_chunks = len(chunks)
+            
+            logger.info(f"Uploading audio '{filename}' in {total_chunks} chunks (optimized)")
+
+            for i, chunk in enumerate(chunks, 1):
+                parameter = {
+                    'file_name': filename,
+                    'file_type': 'wav',
+                    'file_size': len(audio_data),
+                    'current_block_index': i,
+                    'total_block_number': total_chunks,
+                    'block_content': chunk,
+                    'current_block_size': len(chunk),
+                    'file_md5': file_md5,
+                    'create_time': int(time.time() * 1000)
+                }
+                
+                logger.debug(f"Sending chunk {i}/{total_chunks}")
+                response = self._webrtc_request(
+                    AUDIO_API["UPLOAD_AUDIO_FILE"],
+                    parameter
+                )
+            
+            logger.info(f"Audio upload completed for '{filename}'")
+            
+            list_response = self._webrtc_request(AUDIO_API["GET_AUDIO_LIST"], {})
+            
+            if list_response and 'data' in list_response:
+                data_str = list_response.get('data', {}).get('data', '{}')
+                audio_list = json.loads(data_str).get('audio_list', [])
+                
+                for audio in audio_list:
+                    if audio.get('CUSTOM_NAME') == filename:
+                        return audio.get('UNIQUE_ID')
+            
+            logger.warning(f"Could not find uploaded audio '{filename}' in list, using filename as UUID")
+            return filename
+            
+        except Exception as e:
+            logger.error(f"Error uploading audio to robot: {e}")
+            raise
+
+    def _play_audio_on_robot(self, uuid: str):
+        try:
+            self._webrtc_request(
+                AUDIO_API["SET_PLAY_MODE"],
+                {'play_mode': PLAY_MODES["NO_CYCLE"]}
+            )
+            time.sleep(0.1)
+            
+            parameter = {'unique_id': uuid}
+            
+            logger.info(f"Playing audio with UUID: {uuid}")
+            self._webrtc_request(
+                AUDIO_API["SELECT_START_PLAY"],
+                parameter
+            )
+            
+        except Exception as e:
+            logger.error(f"Error playing audio on robot: {e}")
+            raise
+
+    def _stop_audio_playback(self):
+        try:
+            logger.debug("Stopping audio playback")
+            self._webrtc_request(AUDIO_API["PAUSE"], {})
+        except Exception as e:
+            logger.warning(f"Error stopping audio playback: {e}")
+
+    def _upload_and_play_megaphone(self, audio_data: bytes, duration: float):
+        try:
+            logger.debug("Entering megaphone mode")
+            self._webrtc_request(AUDIO_API["ENTER_MEGAPHONE"], {})
+            
+            time.sleep(0.2)
+            
+            b64_data = base64.b64encode(audio_data).decode('utf-8')
+            
+            chunk_size = 4096
+            chunks = [b64_data[i:i + chunk_size] for i in range(0, len(b64_data), chunk_size)]
+            total_chunks = len(chunks)
+            
+            logger.info(f"Uploading megaphone audio in {total_chunks} chunks")
+
+            for i, chunk in enumerate(chunks, 1):
+                parameter = {
+                    'current_block_size': len(chunk),
+                    'block_content': chunk,
+                    'current_block_index': i,
+                    'total_block_number': total_chunks
+                }
+                
+                logger.debug(f"Sending megaphone chunk {i}/{total_chunks}")
+                self._webrtc_request(
+                    AUDIO_API["UPLOAD_MEGAPHONE"],
+                    parameter
+                )
+                
+                if i < total_chunks:
+                    time.sleep(0.05)
+            
+            logger.info("Megaphone audio upload completed, waiting for playback")
+            
+            time.sleep(duration + 1.0)
+            
+        except Exception as e:
+            logger.error(f"Error in megaphone mode: {e}")
+            try:
+                self._webrtc_request(AUDIO_API["EXIT_MEGAPHONE"], {})
+            except:
+                pass
+            raise
+        finally:
+            try:
+                logger.debug("Exiting megaphone mode")
+                self._webrtc_request(AUDIO_API["EXIT_MEGAPHONE"], {})
+                time.sleep(0.1)
+            except Exception as e:
+                logger.warning(f"Error exiting megaphone mode: {e}")
+
+    def __call__(self):
+        super().__call__()
+        
+        if not self._robot:
+            logger.error("No robot instance provided to UnitreeSpeak skill")
+            return "Error: No robot instance available"
+        
+        try:
+            display_text = self.text[:50] + "..." if len(self.text) > 50 else self.text
+            logger.info(f"Speaking: '{display_text}'")
+            
+            logger.debug("Generating audio with OpenAI TTS")
+            audio_data = self._generate_audio(self.text)
+            
+            with tempfile.NamedTemporaryFile(suffix='.mp3', delete=False) as tmp_mp3:
+                tmp_mp3.write(audio_data)
+                tmp_mp3_path = tmp_mp3.name
+            
+            try:
+                audio_array, sample_rate = sf.read(tmp_mp3_path)
+                
+                if audio_array.ndim > 1:
+                    audio_array = np.mean(audio_array, axis=1)
+                
+                target_sample_rate = 22050
+                if sample_rate != target_sample_rate:
+                    logger.debug(f"Resampling from {sample_rate}Hz to {target_sample_rate}Hz")
+                    old_length = len(audio_array)
+                    new_length = int(old_length * target_sample_rate / sample_rate)
+                    old_indices = np.arange(old_length)
+                    new_indices = np.linspace(0, old_length - 1, new_length)
+                    audio_array = np.interp(new_indices, old_indices, audio_array)
+                    sample_rate = target_sample_rate
+                
+                audio_array = audio_array / np.max(np.abs(audio_array))
+                
+                with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as tmp_wav:
+                    sf.write(tmp_wav.name, audio_array, sample_rate, format='WAV', subtype='PCM_16')
+                    tmp_wav.seek(0)
+                    wav_data = open(tmp_wav.name, 'rb').read()
+                    os.unlink(tmp_wav.name)
+                
+                logger.info(f"Audio size: {len(wav_data) / 1024:.1f}KB, duration: {len(audio_array) / sample_rate:.1f}s")
+                
+            finally:
+                os.unlink(tmp_mp3_path)
+            
+            if self.use_megaphone:
+                logger.debug("Using megaphone mode for lower latency")
+                duration = len(audio_array) / sample_rate
+                self._upload_and_play_megaphone(wav_data, duration)
+                
+                return f"Spoke: '{display_text}' on robot successfully (megaphone mode)"
+            else:
+                filename = f"speak_{int(time.time() * 1000)}"
+                
+                logger.debug("Uploading audio to robot")
+                uuid = self._upload_audio_to_robot(wav_data, filename)
+                
+                logger.debug("Playing audio on robot")
+                self._play_audio_on_robot(uuid)
+                
+                duration = len(audio_array) / sample_rate
+                logger.debug(f"Waiting {duration:.1f}s for playback to complete")
+                # time.sleep(duration + 0.2)
+                
+                # self._stop_audio_playback()
+                
+                return f"Spoke: '{display_text}' on robot successfully"
+            
+        except Exception as e:
+            logger.error(f"Error in speak skill: {e}")
+            return f"Error speaking text: {str(e)}"
diff --git a/tests/run.py b/tests/run.py
index 977742ba34..9ae6f81398 100644
--- a/tests/run.py
+++ b/tests/run.py
@@ -36,7 +36,7 @@
 import threading
 import json
 from dimos.types.vector import Vector
-from dimos.skills.speak import Speak
+from dimos.skills.unitree.unitree_speak import UnitreeSpeak
 
 from dimos.perception.object_detection_stream import ObjectDetectionStream
 from dimos.perception.detection2d.detic_2d_det import Detic2DDetector
@@ -318,7 +318,7 @@ def combine_with_locations(object_detections):
 robot_skills.add(NavigateWithText)
 # robot_skills.add(FollowHuman) # TODO: broken
 robot_skills.add(GetPose)
-# robot_skills.add(Speak)
+robot_skills.add(UnitreeSpeak)  # Re-enable Speak skill
 robot_skills.add(NavigateToGoal)
 robot_skills.add(Explore)
 
@@ -330,7 +330,7 @@ def combine_with_locations(object_detections):
 robot_skills.create_instance("GetPose", robot=robot)
 robot_skills.create_instance("NavigateToGoal", robot=robot)
 robot_skills.create_instance("Explore", robot=robot)
-# robot_skills.create_instance("Speak", tts_node=tts_node)
+robot_skills.create_instance("UnitreeSpeak", robot=robot)  # Now only needs robot instance
 
 # Subscribe to agent responses and send them to the subject
 agent.get_response_observable().subscribe(lambda x: agent_response_subject.on_next(x))

From f42f372e6952262d0fc3f6a0391fd99f229c1b99 Mon Sep 17 00:00:00 2001
From: spomichter <12108168+spomichter@users.noreply.github.com>
Date: Thu, 10 Jul 2025 14:56:07 +0000
Subject: [PATCH 11/18] CI code cleanup

---
 dimos/skills/unitree/unitree_speak.py | 194 ++++++++++++--------------
 1 file changed, 89 insertions(+), 105 deletions(-)

diff --git a/dimos/skills/unitree/unitree_speak.py b/dimos/skills/unitree/unitree_speak.py
index e121f3c5c2..05004398f9 100644
--- a/dimos/skills/unitree/unitree_speak.py
+++ b/dimos/skills/unitree/unitree_speak.py
@@ -41,20 +41,20 @@
     "UPLOAD_MEGAPHONE": 4003,
 }
 
-PLAY_MODES = {
-    "NO_CYCLE": "no_cycle",
-    "SINGLE_CYCLE": "single_cycle",
-    "LIST_LOOP": "list_loop"
-}
+PLAY_MODES = {"NO_CYCLE": "no_cycle", "SINGLE_CYCLE": "single_cycle", "LIST_LOOP": "list_loop"}
 
 
 class UnitreeSpeak(AbstractRobotSkill):
     """Speak text out loud through the robot's speakers using WebRTC audio upload."""
 
     text: str = Field(..., description="Text to speak")
-    voice: str = Field(default="echo", description="Voice to use (alloy, echo, fable, onyx, nova, shimmer)")
+    voice: str = Field(
+        default="echo", description="Voice to use (alloy, echo, fable, onyx, nova, shimmer)"
+    )
     speed: float = Field(default=1.2, description="Speech speed (0.25 to 4.0)")
-    use_megaphone: bool = Field(default=False, description="Use megaphone mode for lower latency (experimental)")
+    use_megaphone: bool = Field(
+        default=False, description="Use megaphone mode for lower latency (experimental)"
+    )
 
     def __init__(self, **data):
         super().__init__(**data)
@@ -69,11 +69,7 @@ def _generate_audio(self, text: str) -> bytes:
         try:
             client = self._get_openai_client()
             response = client.audio.speech.create(
-                model="tts-1",
-                voice=self.voice,
-                input=text,
-                speed=self.speed,
-                response_format="mp3"
+                model="tts-1", voice=self.voice, input=text, speed=self.speed, response_format="mp3"
             )
             return response.content
         except Exception as e:
@@ -83,82 +79,71 @@ def _generate_audio(self, text: str) -> bytes:
     def _webrtc_request(self, api_id: int, parameter: dict = None):
         if parameter is None:
             parameter = {}
-        
-        request_data = {
-            "api_id": api_id,
-            "parameter": json.dumps(parameter) if parameter else "{}"
-        }
-        
+
+        request_data = {"api_id": api_id, "parameter": json.dumps(parameter) if parameter else "{}"}
+
         return self._robot.webrtc_connection.publish_request(
-            RTC_TOPIC["AUDIO_HUB_REQ"],
-            request_data
+            RTC_TOPIC["AUDIO_HUB_REQ"], request_data
         )
 
     def _upload_audio_to_robot(self, audio_data: bytes, filename: str) -> str:
         try:
             file_md5 = hashlib.md5(audio_data).hexdigest()
-            b64_data = base64.b64encode(audio_data).decode('utf-8')
-            
+            b64_data = base64.b64encode(audio_data).decode("utf-8")
+
             chunk_size = 61440
-            chunks = [b64_data[i:i + chunk_size] for i in range(0, len(b64_data), chunk_size)]
+            chunks = [b64_data[i : i + chunk_size] for i in range(0, len(b64_data), chunk_size)]
             total_chunks = len(chunks)
-            
+
             logger.info(f"Uploading audio '{filename}' in {total_chunks} chunks (optimized)")
 
             for i, chunk in enumerate(chunks, 1):
                 parameter = {
-                    'file_name': filename,
-                    'file_type': 'wav',
-                    'file_size': len(audio_data),
-                    'current_block_index': i,
-                    'total_block_number': total_chunks,
-                    'block_content': chunk,
-                    'current_block_size': len(chunk),
-                    'file_md5': file_md5,
-                    'create_time': int(time.time() * 1000)
+                    "file_name": filename,
+                    "file_type": "wav",
+                    "file_size": len(audio_data),
+                    "current_block_index": i,
+                    "total_block_number": total_chunks,
+                    "block_content": chunk,
+                    "current_block_size": len(chunk),
+                    "file_md5": file_md5,
+                    "create_time": int(time.time() * 1000),
                 }
-                
+
                 logger.debug(f"Sending chunk {i}/{total_chunks}")
-                response = self._webrtc_request(
-                    AUDIO_API["UPLOAD_AUDIO_FILE"],
-                    parameter
-                )
-            
+                response = self._webrtc_request(AUDIO_API["UPLOAD_AUDIO_FILE"], parameter)
+
             logger.info(f"Audio upload completed for '{filename}'")
-            
+
             list_response = self._webrtc_request(AUDIO_API["GET_AUDIO_LIST"], {})
-            
-            if list_response and 'data' in list_response:
-                data_str = list_response.get('data', {}).get('data', '{}')
-                audio_list = json.loads(data_str).get('audio_list', [])
-                
+
+            if list_response and "data" in list_response:
+                data_str = list_response.get("data", {}).get("data", "{}")
+                audio_list = json.loads(data_str).get("audio_list", [])
+
                 for audio in audio_list:
-                    if audio.get('CUSTOM_NAME') == filename:
-                        return audio.get('UNIQUE_ID')
-            
-            logger.warning(f"Could not find uploaded audio '{filename}' in list, using filename as UUID")
+                    if audio.get("CUSTOM_NAME") == filename:
+                        return audio.get("UNIQUE_ID")
+
+            logger.warning(
+                f"Could not find uploaded audio '{filename}' in list, using filename as UUID"
+            )
             return filename
-            
+
         except Exception as e:
             logger.error(f"Error uploading audio to robot: {e}")
             raise
 
     def _play_audio_on_robot(self, uuid: str):
         try:
-            self._webrtc_request(
-                AUDIO_API["SET_PLAY_MODE"],
-                {'play_mode': PLAY_MODES["NO_CYCLE"]}
-            )
+            self._webrtc_request(AUDIO_API["SET_PLAY_MODE"], {"play_mode": PLAY_MODES["NO_CYCLE"]})
             time.sleep(0.1)
-            
-            parameter = {'unique_id': uuid}
-            
+
+            parameter = {"unique_id": uuid}
+
             logger.info(f"Playing audio with UUID: {uuid}")
-            self._webrtc_request(
-                AUDIO_API["SELECT_START_PLAY"],
-                parameter
-            )
-            
+            self._webrtc_request(AUDIO_API["SELECT_START_PLAY"], parameter)
+
         except Exception as e:
             logger.error(f"Error playing audio on robot: {e}")
             raise
@@ -174,38 +159,35 @@ def _upload_and_play_megaphone(self, audio_data: bytes, duration: float):
         try:
             logger.debug("Entering megaphone mode")
             self._webrtc_request(AUDIO_API["ENTER_MEGAPHONE"], {})
-            
+
             time.sleep(0.2)
-            
-            b64_data = base64.b64encode(audio_data).decode('utf-8')
-            
+
+            b64_data = base64.b64encode(audio_data).decode("utf-8")
+
             chunk_size = 4096
-            chunks = [b64_data[i:i + chunk_size] for i in range(0, len(b64_data), chunk_size)]
+            chunks = [b64_data[i : i + chunk_size] for i in range(0, len(b64_data), chunk_size)]
             total_chunks = len(chunks)
-            
+
             logger.info(f"Uploading megaphone audio in {total_chunks} chunks")
 
             for i, chunk in enumerate(chunks, 1):
                 parameter = {
-                    'current_block_size': len(chunk),
-                    'block_content': chunk,
-                    'current_block_index': i,
-                    'total_block_number': total_chunks
+                    "current_block_size": len(chunk),
+                    "block_content": chunk,
+                    "current_block_index": i,
+                    "total_block_number": total_chunks,
                 }
-                
+
                 logger.debug(f"Sending megaphone chunk {i}/{total_chunks}")
-                self._webrtc_request(
-                    AUDIO_API["UPLOAD_MEGAPHONE"],
-                    parameter
-                )
-                
+                self._webrtc_request(AUDIO_API["UPLOAD_MEGAPHONE"], parameter)
+
                 if i < total_chunks:
                     time.sleep(0.05)
-            
+
             logger.info("Megaphone audio upload completed, waiting for playback")
-            
+
             time.sleep(duration + 1.0)
-            
+
         except Exception as e:
             logger.error(f"Error in megaphone mode: {e}")
             try:
@@ -223,28 +205,28 @@ def _upload_and_play_megaphone(self, audio_data: bytes, duration: float):
 
     def __call__(self):
         super().__call__()
-        
+
         if not self._robot:
             logger.error("No robot instance provided to UnitreeSpeak skill")
             return "Error: No robot instance available"
-        
+
         try:
             display_text = self.text[:50] + "..." if len(self.text) > 50 else self.text
             logger.info(f"Speaking: '{display_text}'")
-            
+
             logger.debug("Generating audio with OpenAI TTS")
             audio_data = self._generate_audio(self.text)
-            
-            with tempfile.NamedTemporaryFile(suffix='.mp3', delete=False) as tmp_mp3:
+
+            with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as tmp_mp3:
                 tmp_mp3.write(audio_data)
                 tmp_mp3_path = tmp_mp3.name
-            
+
             try:
                 audio_array, sample_rate = sf.read(tmp_mp3_path)
-                
+
                 if audio_array.ndim > 1:
                     audio_array = np.mean(audio_array, axis=1)
-                
+
                 target_sample_rate = 22050
                 if sample_rate != target_sample_rate:
                     logger.debug(f"Resampling from {sample_rate}Hz to {target_sample_rate}Hz")
@@ -254,43 +236,45 @@ def __call__(self):
                     new_indices = np.linspace(0, old_length - 1, new_length)
                     audio_array = np.interp(new_indices, old_indices, audio_array)
                     sample_rate = target_sample_rate
-                
+
                 audio_array = audio_array / np.max(np.abs(audio_array))
-                
-                with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as tmp_wav:
-                    sf.write(tmp_wav.name, audio_array, sample_rate, format='WAV', subtype='PCM_16')
+
+                with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_wav:
+                    sf.write(tmp_wav.name, audio_array, sample_rate, format="WAV", subtype="PCM_16")
                     tmp_wav.seek(0)
-                    wav_data = open(tmp_wav.name, 'rb').read()
+                    wav_data = open(tmp_wav.name, "rb").read()
                     os.unlink(tmp_wav.name)
-                
-                logger.info(f"Audio size: {len(wav_data) / 1024:.1f}KB, duration: {len(audio_array) / sample_rate:.1f}s")
-                
+
+                logger.info(
+                    f"Audio size: {len(wav_data) / 1024:.1f}KB, duration: {len(audio_array) / sample_rate:.1f}s"
+                )
+
             finally:
                 os.unlink(tmp_mp3_path)
-            
+
             if self.use_megaphone:
                 logger.debug("Using megaphone mode for lower latency")
                 duration = len(audio_array) / sample_rate
                 self._upload_and_play_megaphone(wav_data, duration)
-                
+
                 return f"Spoke: '{display_text}' on robot successfully (megaphone mode)"
             else:
                 filename = f"speak_{int(time.time() * 1000)}"
-                
+
                 logger.debug("Uploading audio to robot")
                 uuid = self._upload_audio_to_robot(wav_data, filename)
-                
+
                 logger.debug("Playing audio on robot")
                 self._play_audio_on_robot(uuid)
-                
+
                 duration = len(audio_array) / sample_rate
                 logger.debug(f"Waiting {duration:.1f}s for playback to complete")
                 # time.sleep(duration + 0.2)
-                
+
                 # self._stop_audio_playback()
-                
+
                 return f"Spoke: '{display_text}' on robot successfully"
-            
+
         except Exception as e:
             logger.error(f"Error in speak skill: {e}")
             return f"Error speaking text: {str(e)}"

From 5c4f30522644f2175c8b3132190f0e8154f3d892 Mon Sep 17 00:00:00 2001
From: alexlin2 <alex.lin416@outlook.com>
Date: Thu, 10 Jul 2025 13:38:46 -0700
Subject: [PATCH 12/18] fixed spatial memory cb bug, do recovery after few
 skills

---
 .../wavefront_frontier_goal_selector.py                   | 2 +-
 dimos/robot/unitree_webrtc/unitree_skills.py              | 8 +-------
 dimos/skills/navigation.py                                | 6 ------
 3 files changed, 2 insertions(+), 14 deletions(-)

diff --git a/dimos/robot/frontier_exploration/wavefront_frontier_goal_selector.py b/dimos/robot/frontier_exploration/wavefront_frontier_goal_selector.py
index 5f9032aa28..76f2ddbb0a 100644
--- a/dimos/robot/frontier_exploration/wavefront_frontier_goal_selector.py
+++ b/dimos/robot/frontier_exploration/wavefront_frontier_goal_selector.py
@@ -83,7 +83,7 @@ def __init__(
         self,
         min_frontier_size: int = 10,
         occupancy_threshold: int = 65,
-        subsample_resolution: int = 3,
+        subsample_resolution: int = 2,
         min_distance_from_robot: float = 0.5,
         explored_area_buffer: float = 0.5,
         min_distance_from_obstacles: float = 0.6,
diff --git a/dimos/robot/unitree_webrtc/unitree_skills.py b/dimos/robot/unitree_webrtc/unitree_skills.py
index e6bc4ec759..f9dfc1eede 100644
--- a/dimos/robot/unitree_webrtc/unitree_skills.py
+++ b/dimos/robot/unitree_webrtc/unitree_skills.py
@@ -50,14 +50,8 @@
     (
         "RecoveryStand",
         1006,
-        "Recovers the robot to a state from which it can take more commands. Useful to run after multiple dynamic commands like front flips.",
+        "Recovers the robot to a state from which it can take more commands. Useful to run after multiple dynamic commands like front flips, Must run after skills like sit and jump and standup.",
     ),
-    (
-        "Euler",
-        1007,
-        "Adjusts the robot's orientation using Euler angles, providing precise control over its rotation.",
-    ),
-    # ("Move", 1008, "Move the robot using velocity commands."),  # Handled separately
     ("Sit", 1009, "Commands the robot to sit down from a standing or moving stance."),
     (
         "RiseSit",
diff --git a/dimos/skills/navigation.py b/dimos/skills/navigation.py
index 114e3dacb8..b24ee518c1 100644
--- a/dimos/skills/navigation.py
+++ b/dimos/skills/navigation.py
@@ -405,12 +405,6 @@ def stop(self):
                 logger.error(f"Error disposing navigation task: {e}")
             self._navigation_disposable = None
 
-        # Clean up spatial memory if it exists
-        if hasattr(self, "_spatial_memory") and self._spatial_memory is not None:
-            logger.info("Cleaning up spatial memory")
-            self._spatial_memory.cleanup()
-            self._spatial_memory = None
-
         return "Navigate skill stopped successfully."
 
 

From a260626c65fc0ad08128367e0850e4c55b5ef93d Mon Sep 17 00:00:00 2001
From: alexlin2 <alex.lin416@outlook.com>
Date: Thu, 10 Jul 2025 18:15:15 -0700
Subject: [PATCH 13/18] added some prompt changes, added new skip_visual_search
 flag to navigateWithText

---
 assets/agent/prompt.txt    |  8 +++++++-
 dimos/skills/navigation.py | 38 +++++++++++++++++++++++---------------
 2 files changed, 30 insertions(+), 16 deletions(-)

diff --git a/assets/agent/prompt.txt b/assets/agent/prompt.txt
index 8a2ccff020..4987320263 100644
--- a/assets/agent/prompt.txt
+++ b/assets/agent/prompt.txt
@@ -64,7 +64,13 @@ Saved Robot Locations:
 
 ***ALWAYS CHECK FIRST if you can find a navigation query in the Saved Robot Locations before running the NavigateWithText tool call. If a saved location is found, get there with NavigateToGoal.***
 
-***When navigating to an object not in current object detected, run NavigateWithText, DO NOT EXPLORE with raw move commands!!!
+***Don't use object detections for navigating to an object, PRIORITIZE running NavigateWithText. Only use object detections if NavigateWithText fails***
+
+***When running NavigateWithText, set skip_visual_search flag to TRUE if the query is a general location such as kitchen or office, if it fails, then run without this flag***
+
+***When navigating to an object not in current object detected, run NavigateWithText, DO NOT EXPLORE with raw move commands!!!***
+
+***Prioritize on using the query output from observe skill rather than object detections when the user gives a specific visual query such as "Is there a person wearing a red hat in the view?"***
 
 PLANNING & REASONING:
 - You can develop both short-term and long-term plans to achieve complex goals
diff --git a/dimos/skills/navigation.py b/dimos/skills/navigation.py
index b24ee518c1..6d67ae04f2 100644
--- a/dimos/skills/navigation.py
+++ b/dimos/skills/navigation.py
@@ -60,18 +60,24 @@ class NavigateWithText(AbstractRobotSkill):
 
     This skill first attempts to locate an object in the robot's camera view using vision.
     If the object is found, it navigates to it. If not, it falls back to querying the
-    semantic map for a location matching the description. For example, "Find the kitchen"
-    will first look for a kitchen in view, then check the semantic map coordinates where
-    a kitchen was previously observed.
+    semantic map for a location matching the description. For example, "Find the Teddy Bear"
+    will first look for a Teddy Bear in view, then check the semantic map coordinates where
+    a Teddy Bear was previously observed.
 
     CALL THIS SKILL FOR ONE SUBJECT AT A TIME. For example: "Go to the person wearing a blue shirt in the living room",
     you should call this skill twice, once for the person wearing a blue shirt and once for the living room.
+
+    If skip_visual_search is True, this skill will skip the visual search for the object in view.
+    This is useful if you want to navigate to a general location such as a kitchen or office.
+    For example, "Go to the kitchen" will not look for a kitchen in view, but will check the semantic map coordinates where
+    a kitchen was previously observed.
     """
 
     query: str = Field("", description="Text query to search for in the semantic map")
 
     limit: int = Field(1, description="Maximum number of results to return")
     distance: float = Field(1.0, description="Desired distance to maintain from object in meters")
+    skip_visual_search: bool = Field(False, description="Skip visual search for object in view")
     timeout: float = Field(40.0, description="Maximum time to spend navigating in seconds")
 
     def __init__(self, robot=None, **data):
@@ -362,22 +368,24 @@ def __call__(self):
 
         # First, try to find and navigate to the object in camera view
         logger.info(f"First attempting to find and navigate to visible object: '{self.query}'")
-        object_result = self._navigate_to_object()
 
-        if object_result and object_result["success"]:
-            logger.info(f"Successfully navigated to {self.query} in view")
-            return object_result
+        if not self.skip_visual_search:
+            object_result = self._navigate_to_object()
+
+            if object_result and object_result["success"]:
+                logger.info(f"Successfully navigated to {self.query} in view")
+                return object_result
+
+            elif object_result and object_result["failure_reason"] == "Navigation":
+                logger.info(
+                    f"Failed to navigate to {self.query} in view: {object_result.get('error', 'Unknown error')}"
+                )
+                return object_result
 
-        elif object_result and object_result["failure_reason"] == "Navigation":
+            # If object navigation failed, fall back to semantic map
             logger.info(
-                f"Failed to navigate to {self.query} in view: {object_result.get('error', 'Unknown error')}"
+                f"Object not found in view. Falling back to semantic map query for: '{self.query}'"
             )
-            return object_result
-
-        # If object navigation failed, fall back to semantic map
-        logger.info(
-            f"Object not found in view. Falling back to semantic map query for: '{self.query}'"
-        )
 
         return self._navigate_using_semantic_map()
 

From 5674352755bd3641fc302935f19009dc32be4b8a Mon Sep 17 00:00:00 2001
From: stash <pomichterstash@gmail.com>
Date: Thu, 10 Jul 2025 18:20:00 -0700
Subject: [PATCH 14/18] Fixed observe skill

---
 dimos/models/qwen/video_query.py |  2 +-
 dimos/skills/observe.py          | 12 +++++-------
 2 files changed, 6 insertions(+), 8 deletions(-)

diff --git a/dimos/models/qwen/video_query.py b/dimos/models/qwen/video_query.py
index 3471e52268..7eda5f1aed 100644
--- a/dimos/models/qwen/video_query.py
+++ b/dimos/models/qwen/video_query.py
@@ -128,7 +128,7 @@ def query_single_frame(
         openai_client=qwen_client,
         model_name=model_name,
         tokenizer=HuggingFaceTokenizer(model_name=f"Qwen/{model_name}"),
-        max_output_tokens_per_request=100,
+        max_output_tokens_per_request=8192,
         system_query=query,
         pool_scheduler=get_scheduler(),
     )
diff --git a/dimos/skills/observe.py b/dimos/skills/observe.py
index 844df11805..e4c9a1493c 100644
--- a/dimos/skills/observe.py
+++ b/dimos/skills/observe.py
@@ -106,14 +106,8 @@ def __call__(self):
             # Process the frame with Qwen
             response = self._process_frame_with_qwen(frame)
 
-            # Add the response to the conversation history
-            # self._agent.append_to_history(
-            #     f"Observation: {response}",
-            # )
-            response = self._agent.run_observable_query(f"Observation: {response}")
-
             logger.info(f"Added Qwen observation to conversation history")
-            return f"Observation complete: {response[:100]}..."
+            return f"Observation complete: {response}"
 
         except Exception as e:
             error_msg = f"Error in Observe skill: {e}"
@@ -127,6 +121,10 @@ def _get_frame_from_stream(self):
         Returns:
             A single frame from the video stream, or None if no frame is available
         """
+        if self._video_stream is None:
+            logger.error("Video stream is None")
+            return None
+            
         frame = None
         frame_subject = rx.subject.Subject()
 

From be5450fb591740e91572db0d8b0c270d8929427e Mon Sep 17 00:00:00 2001
From: spomichter <12108168+spomichter@users.noreply.github.com>
Date: Fri, 11 Jul 2025 01:21:53 +0000
Subject: [PATCH 15/18] CI code cleanup

---
 dimos/skills/observe.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dimos/skills/observe.py b/dimos/skills/observe.py
index e4c9a1493c..067307353a 100644
--- a/dimos/skills/observe.py
+++ b/dimos/skills/observe.py
@@ -124,7 +124,7 @@ def _get_frame_from_stream(self):
         if self._video_stream is None:
             logger.error("Video stream is None")
             return None
-            
+
         frame = None
         frame_subject = rx.subject.Subject()
 

From ee97c9604110d34cfe82af767fdedb89de6a629f Mon Sep 17 00:00:00 2001
From: stash <pomichterstash@gmail.com>
Date: Thu, 10 Jul 2025 18:57:23 -0700
Subject: [PATCH 16/18] Fix CORS issue

---
 .../src/components/Input.svelte               | 42 +++++++++----------
 .../src/components/VoiceButton.svelte         |  9 +++-
 .../web/dimos_interface/src/stores/stream.ts  | 13 ++++--
 3 files changed, 39 insertions(+), 25 deletions(-)

diff --git a/dimos/web/dimos_interface/src/components/Input.svelte b/dimos/web/dimos_interface/src/components/Input.svelte
index d7cee270f1..3a2b515f3d 100644
--- a/dimos/web/dimos_interface/src/components/Input.svelte
+++ b/dimos/web/dimos_interface/src/components/Input.svelte
@@ -55,34 +55,34 @@
   };
 
   const executeCommand = async () => {
-    const [commandName, ...args] = command.split(' ');
+      const [commandName, ...args] = command.split(' ');
 
-    if (import.meta.env.VITE_TRACKING_ENABLED === 'true') {
-      track(commandName, ...args);
-    }
+      if (import.meta.env.VITE_TRACKING_ENABLED === 'true') {
+        track(commandName, ...args);
+      }
 
-    const commandFunction = commands[commandName];
+      const commandFunction = commands[commandName];
 
-    if (commandFunction) {
-      const output = await commandFunction(args);
+      if (commandFunction) {
+        const output = await commandFunction(args);
 
-      if (commandName !== 'clear') {
-        if (output && typeof output === 'object' && 'type' in output && output.type === 'STREAM_START') {
-          // Add initial message to history
-          $history = [...$history, { command, outputs: [output.initialMessage] }];
-          // Connect to text stream
-          connectTextStream(output.streamKey);
-        } else {
-          $history = [...$history, { command, outputs: [output] }];
+        if (commandName !== 'clear') {
+          if (output && typeof output === 'object' && 'type' in output && output.type === 'STREAM_START') {
+            // Add initial message to history
+            $history = [...$history, { command, outputs: [output.initialMessage] }];
+            // Connect to text stream
+            connectTextStream(output.streamKey);
+          } else {
+            $history = [...$history, { command, outputs: [output] }];
+          }
         }
+      } else {
+        const output = `${commandName}: command not found`;
+        $history = [...$history, { command, outputs: [output] }];
       }
-    } else {
-      const output = `${commandName}: command not found`;
-      $history = [...$history, { command, outputs: [output] }];
-    }
 
-    command = '';
-    historyIndex = -1;
+      command = '';
+      historyIndex = -1;
   };
 </script>
 
diff --git a/dimos/web/dimos_interface/src/components/VoiceButton.svelte b/dimos/web/dimos_interface/src/components/VoiceButton.svelte
index e09cec0672..0f9682519a 100644
--- a/dimos/web/dimos_interface/src/components/VoiceButton.svelte
+++ b/dimos/web/dimos_interface/src/components/VoiceButton.svelte
@@ -21,6 +21,13 @@
 
   const dispatch = createEventDispatcher();
 
+  // Get the server URL dynamically based on current location
+  const getServerUrl = () => {
+    // In production, use the same host as the frontend but on port 5555
+    const hostname = window.location.hostname;
+    return `http://${hostname}:5555`;
+  };
+
   let isRecording = false;
   let mediaRecorder: MediaRecorder | null = null;
   let chunks: Blob[] = [];
@@ -50,7 +57,7 @@
             formData.append('file', blob, 'recording.webm');
             
             try {
-              const res = await fetch('http://0.0.0.0:5555/upload_audio', {
+              const res = await fetch(`${getServerUrl()}/upload_audio`, {
                 method: 'POST',
                 body: formData
               });
diff --git a/dimos/web/dimos_interface/src/stores/stream.ts b/dimos/web/dimos_interface/src/stores/stream.ts
index 4ea07a71d7..eee46f84bf 100644
--- a/dimos/web/dimos_interface/src/stores/stream.ts
+++ b/dimos/web/dimos_interface/src/stores/stream.ts
@@ -18,6 +18,13 @@ import { writable, derived, get } from 'svelte/store';
 import { simulationManager, simulationStore } from '../utils/simulation';
 import { history } from './history';
 
+// Get the server URL dynamically based on current location
+const getServerUrl = () => {
+  // In production, use the same host as the frontend but on port 5555
+  const hostname = window.location.hostname;
+  return `http://${hostname}:5555`;
+};
+
 interface StreamState {
   isVisible: boolean;
   url: string | null;
@@ -65,7 +72,7 @@ export const combinedStreamState = derived(
 // Function to fetch available streams
 async function fetchAvailableStreams(): Promise<string[]> {
   try {
-    const response = await fetch('http://0.0.0.0:5555/streams', {
+    const response = await fetch(`${getServerUrl()}/streams`, {
       headers: {
         'Accept': 'application/json'
       }
@@ -100,7 +107,7 @@ export const showStream = async (streamKey?: string) => {
 
     streamStore.set({
       isVisible: true,
-      url: 'http://0.0.0.0:5555',
+      url: getServerUrl(),
       streamKeys: selectedStreams,
       isLoading: false,
       error: null,
@@ -134,7 +141,7 @@ export const connectTextStream = (key: string): void => {
   }
 
   // Create new EventSource
-  const eventSource = new EventSource(`http://0.0.0.0:5555/text_stream/${key}`);
+  const eventSource = new EventSource(`${getServerUrl()}/text_stream/${key}`);
   textEventSources[key] = eventSource;
   // Handle incoming messages
   eventSource.addEventListener('message', (event) => {

From 56edcfb03e12e82deb9c6b0c0c57145e219c722c Mon Sep 17 00:00:00 2001
From: alexlin2 <alex.lin416@outlook.com>
Date: Thu, 10 Jul 2025 19:13:30 -0700
Subject: [PATCH 17/18] ammended prompt

---
 assets/agent/prompt.txt | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/assets/agent/prompt.txt b/assets/agent/prompt.txt
index 4987320263..b3d159a7df 100644
--- a/assets/agent/prompt.txt
+++ b/assets/agent/prompt.txt
@@ -64,13 +64,13 @@ Saved Robot Locations:
 
 ***ALWAYS CHECK FIRST if you can find a navigation query in the Saved Robot Locations before running the NavigateWithText tool call. If a saved location is found, get there with NavigateToGoal.***
 
-***Don't use object detections for navigating to an object, PRIORITIZE running NavigateWithText. Only use object detections if NavigateWithText fails***
+***Don't use object detections for navigating to an object, ALWAYS run NavigateWithText. Only use object detections if NavigateWithText fails***
 
 ***When running NavigateWithText, set skip_visual_search flag to TRUE if the query is a general location such as kitchen or office, if it fails, then run without this flag***
 
 ***When navigating to an object not in current object detected, run NavigateWithText, DO NOT EXPLORE with raw move commands!!!***
 
-***Prioritize on using the query output from observe skill rather than object detections when the user gives a specific visual query such as "Is there a person wearing a red hat in the view?"***
+***The object detection list is not a comprehensive source of information, when given a visual query like "go to the person wearing a hat" or "Do you see a dog", always Prioritize running observe skill and NavigateWithText***
 
 PLANNING & REASONING:
 - You can develop both short-term and long-term plans to achieve complex goals

From 8fe7a8a5e7ecfc13cf4980ce36d027e4a37d51c6 Mon Sep 17 00:00:00 2001
From: stash <pomichterstash@gmail.com>
Date: Fri, 11 Jul 2025 18:31:53 -0700
Subject: [PATCH 18/18] Delete unused standalone voice web interface

---
 dimos/web/voice_web_interface.py | 228 -------------------------------
 1 file changed, 228 deletions(-)
 delete mode 100644 dimos/web/voice_web_interface.py

diff --git a/dimos/web/voice_web_interface.py b/dimos/web/voice_web_interface.py
deleted file mode 100644
index c5ab8c0d04..0000000000
--- a/dimos/web/voice_web_interface.py
+++ /dev/null
@@ -1,228 +0,0 @@
-# Copyright 2025 Dimensional Inc.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import annotations
-
-import io
-import time
-import logging
-from pathlib import Path
-from typing import Optional
-
-import numpy as np
-import ffmpeg
-import soundfile as sf
-import reactivex as rx
-from fastapi import FastAPI, UploadFile, File
-from fastapi.responses import HTMLResponse, JSONResponse
-from fastapi.middleware.cors import CORSMiddleware
-from reactivex.subject import Subject
-from reactivex import operators as ops
-
-from dimos.web.edge_io import EdgeIO
-from dimos.stream.audio.base import AudioEvent
-
-logger = logging.getLogger(__name__)
-logging.basicConfig(level=logging.INFO)
-
-
-class VoiceWebInterface(EdgeIO):
-    """A minimal FastAPI server that captures audio from the browser and exposes it
-    as a ReactiveX ``Observable`` of :class:`dimos.stream.audio.base.AudioEvent`.
-
-    The browser side records audio using the MediaRecorder API and sends it as a
-    single *webm* blob to the ``/upload_audio`` endpoint when the user stops
-    recording.  The server converts the blob to mono 16-kHz *wav* using
-    *ffmpeg*, loads it into a NumPy array with *soundfile*, wraps it in an
-    :class:`AudioEvent`, and finally pushes it to an internal
-    ``Subject``.  Down-stream components such as the Whisper STT node can simply
-    subscribe to :pyattr:`audio_stream` or call :pyfunc:`emit_audio` to receive
-    the audio events.
-    """
-
-    def __init__(self, host: str = "0.0.0.0", port: int = 5560):
-        super().__init__(dev_name="Voice Web Interface", edge_type="Input")
-
-        self.host = host
-        self.port = port
-
-        # Reactive stream for audio events
-        self._audio_subject: Subject = Subject()
-        # Shared observable so multiple subscribers receive the same events
-        self.audio_stream = self._audio_subject.pipe(ops.share())
-
-        # FastAPI app & CORS for local development
-        self.app = FastAPI()
-        self.app.add_middleware(
-            CORSMiddleware,
-            allow_origins=["*"],
-            allow_credentials=True,
-            allow_methods=["*"],
-            allow_headers=["*"],
-        )
-
-        self._setup_routes()
-
-    # ------------------------------------------------------------------
-    # Public helpers
-    # ------------------------------------------------------------------
-
-    def emit_audio(self):
-        """Return the shared audio observable."""
-        return self.audio_stream
-
-    # ------------------------------------------------------------------
-    # FastAPI routes
-    # ------------------------------------------------------------------
-
-    def _setup_routes(self):
-        @self.app.get("/", response_class=HTMLResponse)
-        async def index():  # noqa: D401 – simple page
-            """Return minimal HTML that records and uploads audio."""
-            return HTMLResponse(content=self._index_html(), status_code=200)
-
-        @self.app.post("/upload_audio")
-        async def upload_audio(file: UploadFile = File(...)):
-            try:
-                data = await file.read()
-                audio_np, sr = self._decode_audio(data)
-                if audio_np is None:
-                    return JSONResponse(
-                        status_code=400,
-                        content={"success": False, "message": "Unable to decode audio"},
-                    )
-
-                event = AudioEvent(
-                    data=audio_np,
-                    sample_rate=sr,
-                    timestamp=time.time(),
-                    channels=1 if audio_np.ndim == 1 else audio_np.shape[1],
-                )
-
-                # Push to reactive stream
-                self._audio_subject.on_next(event)
-                logger.info("Received audio – %.2f s, %d Hz", event.data.shape[0] / sr, sr)
-                return {"success": True}
-            except Exception as e:  # pragma: no cover – runtime safety
-                logger.exception("Failed to process uploaded audio: %s", e)
-                return JSONResponse(status_code=500, content={"success": False, "message": str(e)})
-
-    # ------------------------------------------------------------------
-    # Internal helpers
-    # ------------------------------------------------------------------
-
-    @staticmethod
-    def _decode_audio(raw: bytes) -> tuple[Optional[np.ndarray], Optional[int]]:
-        """Convert the *webm/opus* blob sent by the browser into mono 16-kHz PCM.
-
-        Returns (audio, sample_rate) or (None, None) on failure.
-        """
-        try:
-            # Use ffmpeg to convert to 16-kHz mono 16-bit PCM WAV in memory
-            out, _ = (
-                ffmpeg.input("pipe:0")
-                .output(
-                    "pipe:1",
-                    format="wav",
-                    acodec="pcm_s16le",
-                    ac=1,
-                    ar="16000",
-                    loglevel="quiet",
-                )
-                .run(input=raw, capture_stdout=True, capture_stderr=True)
-            )
-            # Load with soundfile (returns float32 by default)
-            audio, sr = sf.read(io.BytesIO(out), dtype="float32")
-            # Ensure 1-D array (mono)
-            if audio.ndim > 1:
-                audio = audio[:, 0]
-            return np.array(audio), sr
-        except Exception as exc:
-            logger.error("ffmpeg decoding failed: %s", exc)
-            return None, None
-
-    @staticmethod
-    def _index_html() -> str:
-        """Return HTML/JS for the voice interface."""
-        return """
-<!DOCTYPE html>
-<html lang=\"en\">
-<head>
-    <meta charset=\"UTF-8\" />
-    <meta name=\"viewport\" content=\"width=device-width, initial-scale=1.0\" />
-    <title>Voice Command Interface</title>
-    <style>
-        body { font-family: Arial, sans-serif; display:flex; flex-direction:column; align-items:center; padding:40px; }
-        button { padding: 15px 25px; font-size: 18px; cursor: pointer; border: none; border-radius: 6px; }
-        .rec { background:#dc3545; color:#fff; }
-        .idle { background:#28a745; color:#fff; }
-        #status { margin-top:20px; font-weight:bold; }
-    </style>
-</head>
-<body>
-    <h1>Voice Command Interface</h1>
-    <button id=\"recordBtn\" class=\"idle\">🎤 Start Recording</button>
-    <div id=\"status\"></div>
-
-<script>
-let mediaRecorder;
-let chunks = [];
-const btn = document.getElementById('recordBtn');
-const statusDiv = document.getElementById('status');
-
-btn.addEventListener('click', async () => {
-    if (mediaRecorder && mediaRecorder.state === 'recording') {
-        mediaRecorder.stop();
-        btn.textContent = '🎤 Start Recording';
-        btn.className = 'idle';
-    } else {
-        if (!mediaRecorder) {
-            const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
-            mediaRecorder = new MediaRecorder(stream);
-            mediaRecorder.ondataavailable = e => chunks.push(e.data);
-            mediaRecorder.onstop = async () => {
-                const blob = new Blob(chunks, { type: 'audio/webm' });
-                chunks = [];
-                statusDiv.textContent = 'Uploading…';
-                const formData = new FormData();
-                formData.append('file', blob, 'recording.webm');
-                try {
-                    const res = await fetch('/upload_audio', { method: 'POST', body: formData });
-                    const json = await res.json();
-                    statusDiv.textContent = json.success ? 'Uploaded!' : ('Error: ' + json.message);
-                } catch(err){
-                    statusDiv.textContent = 'Upload failed';
-                }
-            };
-        }
-        mediaRecorder.start();
-        btn.textContent = '⏹️ Stop Recording';
-        btn.className = 'rec';
-        statusDiv.textContent = 'Recording…';
-    }
-});
-</script>
-</body>
-</html>
-"""
-
-    # ------------------------------------------------------------------
-    # Server runner
-    # ------------------------------------------------------------------
-
-    def run(self):
-        """Run the FastAPI application using Uvicorn."""
-        import uvicorn
-
-        uvicorn.run(self.app, host=self.host, port=self.port)