From 582fa32b2eb30eebc28e4447a93fcdeb8cc5ba92 Mon Sep 17 00:00:00 2001 From: stash Date: Thu, 10 Jul 2025 02:57:53 -0700 Subject: [PATCH 01/18] Cerebras added back and working --- tests/run.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/tests/run.py b/tests/run.py index 8ddb2261e0..6f7fbf9d0c 100644 --- a/tests/run.py +++ b/tests/run.py @@ -294,14 +294,15 @@ def combine_with_locations(object_detections): system_query = f.read() # Create a ClaudeAgent instance -agent = ClaudeAgent( +agent = CerebrasAgent( dev_name="test_agent", # input_query_stream=stt_node.emit_text(), input_query_stream=web_interface.query_stream, skills=robot.get_skills(), system_query=system_query, - model_name="claude-3-7-sonnet-latest", - thinking_budget_tokens=0, + # model_name="claude-3-7-sonnet-latest", + # thinking_budget_tokens=0, + model_name="llama-4-scout-17b-16e-instruct", ) # tts_node = tts() From 69f5111ed4b15149b53dcf20b9b7dac000a93866 Mon Sep 17 00:00:00 2001 From: stash Date: Thu, 10 Jul 2025 03:11:31 -0700 Subject: [PATCH 02/18] Removed non-active unitree skills --- dimos/robot/unitree/unitree_skills.py | 176 +++++++++++++------------- 1 file changed, 88 insertions(+), 88 deletions(-) diff --git a/dimos/robot/unitree/unitree_skills.py b/dimos/robot/unitree/unitree_skills.py index d191753294..5029123ed1 100644 --- a/dimos/robot/unitree/unitree_skills.py +++ b/dimos/robot/unitree/unitree_skills.py @@ -51,34 +51,34 @@ 1006, "Recovers the robot to a state from which it can take more commands. Useful to run after multiple dynamic commands like front flips.", ), - ( - "Euler", - 1007, - "Adjusts the robot's orientation using Euler angles, providing precise control over its rotation.", - ), + # ( + # "Euler", + # 1007, + # "Adjusts the robot's orientation using Euler angles, providing precise control over its rotation.", + # ), # ("Move", 1008, "Move the robot using velocity commands."), # Intentionally omitted ("Sit", 1009, "Commands the robot to sit down from a standing or moving stance."), - ( - "RiseSit", - 1010, - "Commands the robot to rise back to a standing position from a sitting posture.", - ), - ( - "SwitchGait", - 1011, - "Switches the robot's walking pattern or style dynamically, suitable for different terrains or speeds.", - ), - ("Trigger", 1012, "Triggers a specific action or custom routine programmed into the robot."), - ( - "BodyHeight", - 1013, - "Adjusts the height of the robot's body from the ground, useful for navigating various obstacles.", - ), - ( - "FootRaiseHeight", - 1014, - "Controls how high the robot lifts its feet during movement, which can be adjusted for different surfaces.", - ), + # ( + # "RiseSit", + # 1010, + # "Commands the robot to rise back to a standing position from a sitting posture.", + # ), + # ( + # "SwitchGait", + # 1011, + # "Switches the robot's walking pattern or style dynamically, suitable for different terrains or speeds.", + # ), + # ("Trigger", 1012, "Triggers a specific action or custom routine programmed into the robot."), + # ( + # "BodyHeight", + # 1013, + # "Adjusts the height of the robot's body from the ground, useful for navigating various obstacles.", + # ), + # ( + # "FootRaiseHeight", + # 1014, + # "Controls how high the robot lifts its feet during movement, which can be adjusted for different surfaces.", + # ), ( "SpeedLevel", 1015, @@ -90,16 +90,16 @@ "Performs a greeting action, which could involve a wave or other friendly gesture.", ), ("Stretch", 1017, "Engages the robot in a stretching routine."), - ( - "TrajectoryFollow", - 1018, - "Directs the robot to follow a predefined trajectory, which could involve complex paths or maneuvers.", - ), - ( - "ContinuousGait", - 1019, - "Enables a mode for continuous walking or running, ideal for long-distance travel.", - ), + # ( + # "TrajectoryFollow", + # 1018, + # "Directs the robot to follow a predefined trajectory, which could involve complex paths or maneuvers.", + # ), + # ( + # "ContinuousGait", + # 1019, + # "Enables a mode for continuous walking or running, ideal for long-distance travel.", + # ), ("Content", 1020, "To display or trigger when the robot is happy."), ("Wallow", 1021, "The robot falls onto its back and rolls around."), ( @@ -108,18 +108,18 @@ "Performs a predefined dance routine 1, programmed for entertainment or demonstration.", ), ("Dance2", 1023, "Performs another variant of a predefined dance routine 2."), - ("GetBodyHeight", 1024, "Retrieves the current height of the robot's body from the ground."), - ( - "GetFootRaiseHeight", - 1025, - "Retrieves the current height at which the robot's feet are being raised during movement.", - ), - ("GetSpeedLevel", 1026, "Returns the current speed level at which the robot is operating."), - ( - "SwitchJoystick", - 1027, - "Toggles the control mode to joystick input, allowing for manual direction of the robot's movements.", - ), + # ("GetBodyHeight", 1024, "Retrieves the current height of the robot's body from the ground."), + # ( + # "GetFootRaiseHeight", + # 1025, + # "Retrieves the current height at which the robot's feet are being raised during movement.", + # ), + # ("GetSpeedLevel", 1026, "Returns the current speed level at which the robot is operating."), + # ( + # "SwitchJoystick", + # 1027, + # "Toggles the control mode to joystick input, allowing for manual direction of the robot's movements.", + # ), ( "Pose", 1028, @@ -137,46 +137,46 @@ 1032, "Initiates a pouncing movement forward, mimicking animal-like pouncing behavior.", ), - ("WiggleHips", 1033, "Causes the robot to wiggle its hips."), - ( - "GetState", - 1034, - "Retrieves the current operational state of the robot, including status reports or diagnostic information.", - ), - ( - "EconomicGait", - 1035, - "Engages a more energy-efficient walking or running mode to conserve battery life.", - ), - ("FingerHeart", 1036, "Performs a finger heart gesture while on its hind legs."), - ( - "Handstand", - 1301, - "Commands the robot to perform a handstand, demonstrating balance and control.", - ), - ( - "CrossStep", - 1302, - "Engages the robot in a cross-stepping routine, useful for complex locomotion or dance moves.", - ), - ( - "OnesidedStep", - 1303, - "Commands the robot to perform a stepping motion that predominantly uses one side.", - ), - ( - "Bound", - 1304, - "Initiates a bounding motion, similar to a light, repetitive hopping or leaping.", - ), - ( - "LeadFollow", - 1045, - "Engages follow-the-leader behavior, where the robot follows a designated leader or follows a signal.", - ), - ("LeftFlip", 1042, "Executes a flip towards the left side."), - ("RightFlip", 1043, "Performs a flip towards the right side."), - ("Backflip", 1044, "Executes a backflip, a complex and dynamic maneuver."), + # ("WiggleHips", 1033, "Causes the robot to wiggle its hips."), + # ( + # "GetState", + # 1034, + # "Retrieves the current operational state of the robot, including status reports or diagnostic information.", + # ), + # ( + # "EconomicGait", + # 1035, + # "Engages a more energy-efficient walking or running mode to conserve battery life.", + # ), + # ("FingerHeart", 1036, "Performs a finger heart gesture while on its hind legs."), + # ( + # "Handstand", + # 1301, + # "Commands the robot to perform a handstand, demonstrating balance and control.", + # ), + # ( + # "CrossStep", + # 1302, + # "Engages the robot in a cross-stepping routine, useful for complex locomotion or dance moves.", + # ), + # ( + # "OnesidedStep", + # 1303, + # "Commands the robot to perform a stepping motion that predominantly uses one side.", + # ), + # ( + # "Bound", + # 1304, + # "Initiates a bounding motion, similar to a light, repetitive hopping or leaping.", + # ), + # ( + # "LeadFollow", + # 1045, + # "Engages follow-the-leader behavior, where the robot follows a designated leader or follows a signal.", + # ), + # ("LeftFlip", 1042, "Executes a flip towards the left side."), + # ("RightFlip", 1043, "Performs a flip towards the right side."), + # ("Backflip", 1044, "Executes a backflip, a complex and dynamic maneuver."), ] # region MyUnitreeSkills From e442e0ce48e126d016b2762bcafc3ea96a4dea2e Mon Sep 17 00:00:00 2001 From: stash Date: Thu, 10 Jul 2025 03:59:42 -0700 Subject: [PATCH 03/18] Added error catching for failed calls --- dimos/agents/claude_agent.py | 65 +++++++++++++++++++++++++++--------- dimos/skills/skills.py | 47 +++++++++++++++----------- tests/run.py | 52 ++++++++++++++--------------- 3 files changed, 102 insertions(+), 62 deletions(-) diff --git a/dimos/agents/claude_agent.py b/dimos/agents/claude_agent.py index 53148d17b3..e87b1f47b4 100644 --- a/dimos/agents/claude_agent.py +++ b/dimos/agents/claude_agent.py @@ -624,8 +624,11 @@ def _observable_query( observer.on_completed() except Exception as e: logger.error(f"Query failed in {self.dev_name}: {e}") - observer.on_error(e) - self.response_subject.on_error(e) + # Send a user-friendly error message instead of propagating the error + error_message = "I apologize, but I'm having trouble processing your request right now. Please try again." + observer.on_next(error_message) + self.response_subject.on_next(error_message) + observer.on_completed() def _handle_tooling(self, response_message, messages): """Executes tools and appends tool-use/result blocks to messages.""" @@ -649,12 +652,36 @@ def _handle_tooling(self, response_message, messages): } messages.append({"role": "assistant", "content": [tool_use_block]}) - # Execute the tool - args = json.loads(tool_call.function.arguments) - tool_result = self.skills.call(tool_call.function.name, **args) - - # Add tool result to conversation history - if tool_result: + try: + # Execute the tool + args = json.loads(tool_call.function.arguments) + tool_result = self.skills.call(tool_call.function.name, **args) + + # Check if the result is an error message + if isinstance(tool_result, str) and ( + "Error executing skill" in tool_result or "is not available" in tool_result + ): + # Log the error but provide a user-friendly message + logger.error(f"Tool execution failed: {tool_result}") + tool_result = "I apologize, but I'm having trouble executing that action right now. Please try again or ask for something else." + + # Add tool result to conversation history + if tool_result: + messages.append( + { + "role": "user", + "content": [ + { + "type": "tool_result", + "tool_use_id": tool_call.id, + "content": f"{tool_result}", + } + ], + } + ) + except Exception as e: + logger.error(f"Unexpected error executing tool {tool_call.function.name}: {e}") + # Add error result to conversation history messages.append( { "role": "user", @@ -662,7 +689,7 @@ def _handle_tooling(self, response_message, messages): { "type": "tool_result", "tool_use_id": tool_call.id, - "content": f"{tool_result}", + "content": "I apologize, but I encountered an error while trying to execute that action. Please try again.", } ], } @@ -672,13 +699,19 @@ def _tooling_callback(self, response_message): """Runs the observable query for each tool call in the current response_message""" if not hasattr(response_message, "tool_calls") or not response_message.tool_calls: return - for tool_call in response_message.tool_calls: - tool_name = tool_call.function.name - tool_id = tool_call.id - self.run_observable_query( - query_text=f"Tool {tool_name}, ID: {tool_id} execution complete. Please summarize the results and continue.", - thinking_budget_tokens=0, - ).run() + + try: + for tool_call in response_message.tool_calls: + tool_name = tool_call.function.name + tool_id = tool_call.id + self.run_observable_query( + query_text=f"Tool {tool_name}, ID: {tool_id} execution complete. Please summarize the results and continue.", + thinking_budget_tokens=0, + ).run() + except Exception as e: + logger.error(f"Error in tooling callback: {e}") + # Continue processing even if the callback fails + pass def _debug_api_call(self, claude_params: dict): """Debugging function to log API calls with truncated base64 data.""" diff --git a/dimos/skills/skills.py b/dimos/skills/skills.py index 559f9b0e36..f6c7456d24 100644 --- a/dimos/skills/skills.py +++ b/dimos/skills/skills.py @@ -121,28 +121,35 @@ def create_instance(self, name, **kwargs): print(f"Stored args for later instance creation: {name} with args: {kwargs}") def call(self, name, **args): - # Get the stored args if available; otherwise, use an empty dict - stored_args = self._instances.get(name, {}) - - # Merge the arguments with priority given to stored arguments - complete_args = {**args, **stored_args} - - # Dynamically get the class from the module or current script - skill_class = getattr(self, name, None) - if skill_class is None: - for skill in self.get(): - if name == skill.__name__: - skill_class = skill - break - if skill_class is None: - raise ValueError(f"Skill class not found: {name}") + try: + # Get the stored args if available; otherwise, use an empty dict + stored_args = self._instances.get(name, {}) - # Initialize the instance with the merged arguments - instance = skill_class(**complete_args) - print(f"Instance created and function called for: {name} with args: {complete_args}") + # Merge the arguments with priority given to stored arguments + complete_args = {**args, **stored_args} - # Call the instance directly - return instance() + # Dynamically get the class from the module or current script + skill_class = getattr(self, name, None) + if skill_class is None: + for skill in self.get(): + if name == skill.__name__: + skill_class = skill + break + if skill_class is None: + error_msg = f"Skill '{name}' is not available. Please check if it's properly registered." + logger.error(f"Skill class not found: {name}") + return error_msg + + # Initialize the instance with the merged arguments + instance = skill_class(**complete_args) + print(f"Instance created and function called for: {name} with args: {complete_args}") + + # Call the instance directly + return instance() + except Exception as e: + error_msg = f"Error executing skill '{name}': {str(e)}" + logger.error(error_msg) + return error_msg # ==== Tools ==== diff --git a/tests/run.py b/tests/run.py index 6f7fbf9d0c..23c6738c48 100644 --- a/tests/run.py +++ b/tests/run.py @@ -216,32 +216,30 @@ def newmap(msg): min_confidence = 0.6 class_filter = None # No class filtering min_confidence = 0.99 # temporarily disable detections -# detector = Detic2DDetector(vocabulary=None, threshold=min_confidence) # Create video stream from robot's camera -video_stream = robot.get_video_stream() # WebRTC doesn't use ROS video stream +video_stream = backpressure(robot.get_video_stream()) # WebRTC doesn't use ROS video stream # # Initialize ObjectDetectionStream with robot -# object_detector = ObjectDetectionStream( -# camera_intrinsics=robot.camera_intrinsics, -# min_confidence=min_confidence, -# class_filter=class_filter, -# get_pose=robot.get_pose, -# detector=detector, -# video_stream=video_stream, -# ) +object_detector = ObjectDetectionStream( + camera_intrinsics=robot.camera_intrinsics, + min_confidence=min_confidence, + class_filter=class_filter, + get_pose=robot.get_pose, + video_stream=video_stream, +) # # Create visualization stream for web interface -# viz_stream = backpressure(object_detector.get_stream()).pipe( -# ops.share(), -# ops.map(lambda x: x["viz_frame"] if x is not None else None), -# ops.filter(lambda x: x is not None), -# ) +viz_stream = backpressure(object_detector.get_stream()).pipe( + ops.share(), + ops.map(lambda x: x["viz_frame"] if x is not None else None), + ops.filter(lambda x: x is not None), +) # # Get the formatted detection stream -# formatted_detection_stream = object_detector.get_formatted_stream().pipe( -# ops.filter(lambda x: x is not None) -# ) +formatted_detection_stream = object_detector.get_formatted_stream().pipe( + ops.filter(lambda x: x is not None) +) # Create a direct mapping that combines detection data with locations @@ -272,12 +270,12 @@ def combine_with_locations(object_detections): # Create the combined stream with a simple pipe operation -# enhanced_data_stream = formatted_detection_stream.pipe(ops.map(combine_with_locations), ops.share()) +enhanced_data_stream = formatted_detection_stream.pipe(ops.map(combine_with_locations), ops.share()) streams = { "unitree_video": robot.get_video_stream(), # Changed from get_ros_video_stream to get_video_stream for WebRTC "local_planner_viz": local_planner_viz_stream, - # "object_detection": viz_stream, # Uncommented object detection + "object_detection": viz_stream, # Uncommented object detection } text_streams = { "agent_responses": agent_response_stream, @@ -294,15 +292,17 @@ def combine_with_locations(object_detections): system_query = f.read() # Create a ClaudeAgent instance -agent = CerebrasAgent( +agent = ClaudeAgent( dev_name="test_agent", # input_query_stream=stt_node.emit_text(), input_query_stream=web_interface.query_stream, + input_data_stream=enhanced_data_stream, skills=robot.get_skills(), system_query=system_query, - # model_name="claude-3-7-sonnet-latest", - # thinking_budget_tokens=0, - model_name="llama-4-scout-17b-16e-instruct", + model_name="claude-3-5-haiku-latest", + thinking_budget_tokens=0, + max_output_tokens_per_request=8192, + # model_name="llama-4-scout-17b-16e-instruct", ) # tts_node = tts() @@ -313,7 +313,7 @@ def combine_with_locations(object_detections): robot_skills.add(Observe) robot_skills.add(KillSkill) robot_skills.add(NavigateWithText) -robot_skills.add(FollowHuman) +# robot_skills.add(FollowHuman) # TODO: broken robot_skills.add(GetPose) # robot_skills.add(Speak) robot_skills.add(NavigateToGoal) @@ -323,7 +323,7 @@ def combine_with_locations(object_detections): robot_skills.create_instance("Observe", robot=robot, agent=agent) robot_skills.create_instance("KillSkill", robot=robot, skill_library=robot_skills) robot_skills.create_instance("NavigateWithText", robot=robot) -robot_skills.create_instance("FollowHuman", robot=robot) +# robot_skills.create_instance("FollowHuman", robot=robot) robot_skills.create_instance("GetPose", robot=robot) robot_skills.create_instance("NavigateToGoal", robot=robot) robot_skills.create_instance("Explore", robot=robot) From 4fcd9b2c2f6928acc1f176ceecc22179b4ec9e29 Mon Sep 17 00:00:00 2001 From: stash Date: Thu, 10 Jul 2025 04:28:09 -0700 Subject: [PATCH 04/18] Fixed broken due to transform_robot_to_map() breaking update --- dimos/perception/object_detection_stream.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dimos/perception/object_detection_stream.py b/dimos/perception/object_detection_stream.py index 552a31acfe..3284d99f8b 100644 --- a/dimos/perception/object_detection_stream.py +++ b/dimos/perception/object_detection_stream.py @@ -205,7 +205,7 @@ def process_frame(frame): # position and rotation are already Vector objects, no need to convert robot_pose = self.get_pose() position, rotation = transform_robot_to_map( - robot_pose, position, rotation + robot_pose["position"], robot_pose["rotation"], position, rotation ) except Exception as e: logger.error(f"Error transforming to map frame: {e}") From 4769796eec5f93e7b838135219ab63134ef3a350 Mon Sep 17 00:00:00 2001 From: stash Date: Thu, 10 Jul 2025 04:28:56 -0700 Subject: [PATCH 05/18] Fully working obj det stream --- tests/run.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tests/run.py b/tests/run.py index 23c6738c48..e21264a069 100644 --- a/tests/run.py +++ b/tests/run.py @@ -215,7 +215,6 @@ def newmap(msg): # Initialize object detection stream min_confidence = 0.6 class_filter = None # No class filtering -min_confidence = 0.99 # temporarily disable detections # Create video stream from robot's camera video_stream = backpressure(robot.get_video_stream()) # WebRTC doesn't use ROS video stream @@ -223,10 +222,10 @@ def newmap(msg): # # Initialize ObjectDetectionStream with robot object_detector = ObjectDetectionStream( camera_intrinsics=robot.camera_intrinsics, - min_confidence=min_confidence, class_filter=class_filter, get_pose=robot.get_pose, video_stream=video_stream, + draw_masks=True, ) # # Create visualization stream for web interface From 064710c51e9c4e5dadb2d2fbf47b60767b7752e2 Mon Sep 17 00:00:00 2001 From: stash Date: Thu, 10 Jul 2025 04:38:09 -0700 Subject: [PATCH 06/18] Voice interface integrated and working on localhost --- dimos/web/voice_web_interface.py | 228 +++++++++++++++++++++++++++++++ tests/run.py | 16 ++- 2 files changed, 241 insertions(+), 3 deletions(-) create mode 100644 dimos/web/voice_web_interface.py diff --git a/dimos/web/voice_web_interface.py b/dimos/web/voice_web_interface.py new file mode 100644 index 0000000000..c5ab8c0d04 --- /dev/null +++ b/dimos/web/voice_web_interface.py @@ -0,0 +1,228 @@ +# Copyright 2025 Dimensional Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import annotations + +import io +import time +import logging +from pathlib import Path +from typing import Optional + +import numpy as np +import ffmpeg +import soundfile as sf +import reactivex as rx +from fastapi import FastAPI, UploadFile, File +from fastapi.responses import HTMLResponse, JSONResponse +from fastapi.middleware.cors import CORSMiddleware +from reactivex.subject import Subject +from reactivex import operators as ops + +from dimos.web.edge_io import EdgeIO +from dimos.stream.audio.base import AudioEvent + +logger = logging.getLogger(__name__) +logging.basicConfig(level=logging.INFO) + + +class VoiceWebInterface(EdgeIO): + """A minimal FastAPI server that captures audio from the browser and exposes it + as a ReactiveX ``Observable`` of :class:`dimos.stream.audio.base.AudioEvent`. + + The browser side records audio using the MediaRecorder API and sends it as a + single *webm* blob to the ``/upload_audio`` endpoint when the user stops + recording. The server converts the blob to mono 16-kHz *wav* using + *ffmpeg*, loads it into a NumPy array with *soundfile*, wraps it in an + :class:`AudioEvent`, and finally pushes it to an internal + ``Subject``. Down-stream components such as the Whisper STT node can simply + subscribe to :pyattr:`audio_stream` or call :pyfunc:`emit_audio` to receive + the audio events. + """ + + def __init__(self, host: str = "0.0.0.0", port: int = 5560): + super().__init__(dev_name="Voice Web Interface", edge_type="Input") + + self.host = host + self.port = port + + # Reactive stream for audio events + self._audio_subject: Subject = Subject() + # Shared observable so multiple subscribers receive the same events + self.audio_stream = self._audio_subject.pipe(ops.share()) + + # FastAPI app & CORS for local development + self.app = FastAPI() + self.app.add_middleware( + CORSMiddleware, + allow_origins=["*"], + allow_credentials=True, + allow_methods=["*"], + allow_headers=["*"], + ) + + self._setup_routes() + + # ------------------------------------------------------------------ + # Public helpers + # ------------------------------------------------------------------ + + def emit_audio(self): + """Return the shared audio observable.""" + return self.audio_stream + + # ------------------------------------------------------------------ + # FastAPI routes + # ------------------------------------------------------------------ + + def _setup_routes(self): + @self.app.get("/", response_class=HTMLResponse) + async def index(): # noqa: D401 – simple page + """Return minimal HTML that records and uploads audio.""" + return HTMLResponse(content=self._index_html(), status_code=200) + + @self.app.post("/upload_audio") + async def upload_audio(file: UploadFile = File(...)): + try: + data = await file.read() + audio_np, sr = self._decode_audio(data) + if audio_np is None: + return JSONResponse( + status_code=400, + content={"success": False, "message": "Unable to decode audio"}, + ) + + event = AudioEvent( + data=audio_np, + sample_rate=sr, + timestamp=time.time(), + channels=1 if audio_np.ndim == 1 else audio_np.shape[1], + ) + + # Push to reactive stream + self._audio_subject.on_next(event) + logger.info("Received audio – %.2f s, %d Hz", event.data.shape[0] / sr, sr) + return {"success": True} + except Exception as e: # pragma: no cover – runtime safety + logger.exception("Failed to process uploaded audio: %s", e) + return JSONResponse(status_code=500, content={"success": False, "message": str(e)}) + + # ------------------------------------------------------------------ + # Internal helpers + # ------------------------------------------------------------------ + + @staticmethod + def _decode_audio(raw: bytes) -> tuple[Optional[np.ndarray], Optional[int]]: + """Convert the *webm/opus* blob sent by the browser into mono 16-kHz PCM. + + Returns (audio, sample_rate) or (None, None) on failure. + """ + try: + # Use ffmpeg to convert to 16-kHz mono 16-bit PCM WAV in memory + out, _ = ( + ffmpeg.input("pipe:0") + .output( + "pipe:1", + format="wav", + acodec="pcm_s16le", + ac=1, + ar="16000", + loglevel="quiet", + ) + .run(input=raw, capture_stdout=True, capture_stderr=True) + ) + # Load with soundfile (returns float32 by default) + audio, sr = sf.read(io.BytesIO(out), dtype="float32") + # Ensure 1-D array (mono) + if audio.ndim > 1: + audio = audio[:, 0] + return np.array(audio), sr + except Exception as exc: + logger.error("ffmpeg decoding failed: %s", exc) + return None, None + + @staticmethod + def _index_html() -> str: + """Return HTML/JS for the voice interface.""" + return """ + + + + + + Voice Command Interface + + + +

Voice Command Interface

+ +
+ + + + +""" + + # ------------------------------------------------------------------ + # Server runner + # ------------------------------------------------------------------ + + def run(self): + """Run the FastAPI application using Uvicorn.""" + import uvicorn + + uvicorn.run(self.app, host=self.host, port=self.port) diff --git a/tests/run.py b/tests/run.py index e21264a069..72b05ea2df 100644 --- a/tests/run.py +++ b/tests/run.py @@ -24,6 +24,7 @@ # from dimos.robot.unitree.unitree_ros_control import UnitreeROSControl from dimos.robot.unitree.unitree_skills import MyUnitreeSkills from dimos.web.robot_web_interface import RobotWebInterface +from dimos.web.voice_web_interface import VoiceWebInterface from dimos.web.websocket_vis.server import WebsocketVis from dimos.skills.observe_stream import ObserveStream from dimos.skills.observe import Observe @@ -282,7 +283,11 @@ def combine_with_locations(object_detections): web_interface = RobotWebInterface(port=5555, text_streams=text_streams, **streams) -# stt_node = stt() +# Initialize voice web interface +voice_web_interface = VoiceWebInterface(port=5560) + +stt_node = stt() +stt_node.consume_audio(voice_web_interface.emit_audio()) # Read system query from prompt.txt file with open( @@ -293,8 +298,8 @@ def combine_with_locations(object_detections): # Create a ClaudeAgent instance agent = ClaudeAgent( dev_name="test_agent", - # input_query_stream=stt_node.emit_text(), - input_query_stream=web_interface.query_stream, + input_query_stream=stt_node.emit_text(), + # input_query_stream=web_interface.query_stream, input_data_stream=enhanced_data_stream, skills=robot.get_skills(), system_query=system_query, @@ -339,6 +344,11 @@ def combine_with_locations(object_detections): web_thread.daemon = True web_thread.start() +# Start voice web interface in a separate thread to avoid blocking +voice_thread = threading.Thread(target=voice_web_interface.run) +voice_thread.daemon = True +voice_thread.start() + try: while True: # Main loop - can add robot movement or other logic here From 512b1a9dc1a57f9070f1f486123031f6d9db90a9 Mon Sep 17 00:00:00 2001 From: stash Date: Thu, 10 Jul 2025 05:07:35 -0700 Subject: [PATCH 07/18] Working voice interface integrated to localhost:5555 backend --- dimos/web/dimos_interface/api/server.py | 73 ++++++++- .../api/templates/index_fastapi.html | 148 ++++++++++++++++++ dimos/web/robot_web_interface.py | 3 +- tests/run.py | 16 +- 4 files changed, 227 insertions(+), 13 deletions(-) diff --git a/dimos/web/dimos_interface/api/server.py b/dimos/web/dimos_interface/api/server.py index 812973e300..bcc590ab46 100644 --- a/dimos/web/dimos_interface/api/server.py +++ b/dimos/web/dimos_interface/api/server.py @@ -27,7 +27,7 @@ # Fast Api & Uvicorn import cv2 from dimos.web.edge_io import EdgeIO -from fastapi import FastAPI, Request, Form, HTTPException +from fastapi import FastAPI, Request, Form, HTTPException, UploadFile, File from fastapi.responses import HTMLResponse, StreamingResponse, JSONResponse from sse_starlette.sse import EventSourceResponse from fastapi.templating import Jinja2Templates @@ -42,6 +42,14 @@ import reactivex as rx from fastapi.middleware.cors import CORSMiddleware +# For audio processing +import io +import time +import numpy as np +import ffmpeg +import soundfile as sf +from dimos.stream.audio.base import AudioEvent + # TODO: Resolve threading, start/stop stream functionality. @@ -53,6 +61,7 @@ def __init__( host="0.0.0.0", port=5555, text_streams=None, + audio_subject=None, **streams, ): print("Starting FastAPIServer initialization...") # Debug print @@ -88,6 +97,7 @@ def __init__( # Create a Subject for text queries self.query_subject = rx.subject.Subject() self.query_stream = self.query_subject.pipe(ops.share()) + self.audio_subject = audio_subject for key in self.streams: if self.streams[key] is not None: @@ -198,6 +208,33 @@ async def text_stream_generator(self, key): finally: self.text_clients.remove(client_id) + @staticmethod + def _decode_audio(raw: bytes) -> tuple[np.ndarray, int]: + """Convert the webm/opus blob sent by the browser into mono 16-kHz PCM.""" + try: + # Use ffmpeg to convert to 16-kHz mono 16-bit PCM WAV in memory + out, _ = ( + ffmpeg.input("pipe:0") + .output( + "pipe:1", + format="wav", + acodec="pcm_s16le", + ac=1, + ar="16000", + loglevel="quiet", + ) + .run(input=raw, capture_stdout=True, capture_stderr=True) + ) + # Load with soundfile (returns float32 by default) + audio, sr = sf.read(io.BytesIO(out), dtype="float32") + # Ensure 1-D array (mono) + if audio.ndim > 1: + audio = audio[:, 0] + return np.array(audio), sr + except Exception as exc: + print(f"ffmpeg decoding failed: {exc}") + return None, None + def setup_routes(self): """Set up FastAPI routes.""" @@ -221,6 +258,7 @@ async def index(request: Request): "request": request, "stream_keys": stream_keys, "text_stream_keys": text_stream_keys, + "has_voice": self.audio_subject is not None, }, ) @@ -240,6 +278,39 @@ async def submit_query(query: str = Form(...)): content={"success": False, "message": f"Server error: {str(e)}"}, ) + @self.app.post("/upload_audio") + async def upload_audio(file: UploadFile = File(...)): + """Handle audio upload from the browser.""" + if self.audio_subject is None: + return JSONResponse( + status_code=400, + content={"success": False, "message": "Voice input not configured"}, + ) + + try: + data = await file.read() + audio_np, sr = self._decode_audio(data) + if audio_np is None: + return JSONResponse( + status_code=400, + content={"success": False, "message": "Unable to decode audio"}, + ) + + event = AudioEvent( + data=audio_np, + sample_rate=sr, + timestamp=time.time(), + channels=1 if audio_np.ndim == 1 else audio_np.shape[1], + ) + + # Push to reactive stream + self.audio_subject.on_next(event) + print(f"Received audio – {event.data.shape[0] / sr:.2f} s, {sr} Hz") + return {"success": True} + except Exception as e: + print(f"Failed to process uploaded audio: {e}") + return JSONResponse(status_code=500, content={"success": False, "message": str(e)}) + # Unitree API endpoints @self.app.get("/unitree/status") async def unitree_status(): diff --git a/dimos/web/dimos_interface/api/templates/index_fastapi.html b/dimos/web/dimos_interface/api/templates/index_fastapi.html index 94cf0be328..406557c04a 100644 --- a/dimos/web/dimos_interface/api/templates/index_fastapi.html +++ b/dimos/web/dimos_interface/api/templates/index_fastapi.html @@ -130,6 +130,7 @@ .query-form { display: flex; gap: 10px; + align-items: center; } .query-input { @@ -155,6 +156,81 @@ background-color: #218838; } + /* Voice button styles */ + .voice-button { + width: 50px; + height: 50px; + border-radius: 50%; + background-color: #dc3545; + color: white; + border: none; + cursor: pointer; + display: flex; + align-items: center; + justify-content: center; + font-size: 24px; + transition: all 0.3s ease; + box-shadow: 0 2px 5px rgba(0,0,0,0.2); + position: relative; + } + + .voice-button:hover { + transform: scale(1.1); + box-shadow: 0 4px 8px rgba(0,0,0,0.3); + } + + .voice-button.recording { + background-color: #ff0000; + animation: pulse 1.5s infinite; + } + + .voice-button.recording::after { + content: ''; + position: absolute; + top: -10px; + left: -10px; + right: -10px; + bottom: -10px; + border: 3px solid rgba(255, 0, 0, 0.5); + border-radius: 50%; + animation: ripple 1.5s infinite; + } + + @keyframes pulse { + 0% { transform: scale(1); } + 50% { transform: scale(1.05); } + 100% { transform: scale(1); } + } + + @keyframes ripple { + 0% { + transform: scale(1); + opacity: 1; + } + 100% { + transform: scale(1.2); + opacity: 0; + } + } + + .voice-status { + position: absolute; + top: -25px; + left: 50%; + transform: translateX(-50%); + background-color: rgba(0,0,0,0.8); + color: white; + padding: 4px 8px; + border-radius: 4px; + font-size: 12px; + white-space: nowrap; + display: none; + } + + .voice-button.recording .voice-status { + display: block; + } + .query-response { margin-top: 15px; padding: 10px; @@ -218,6 +294,12 @@

Ask a Question

+ {% if has_voice %} + + {% endif %}
@@ -277,6 +359,72 @@

{{ key.replace('_', ' ').title() }}

window.location.reload(true); } + // Voice recording functionality + {% if has_voice %} + let mediaRecorder; + let chunks = []; + const voiceBtn = document.getElementById('voiceButton'); + const queryResponse = document.getElementById('queryResponse'); + + voiceBtn.addEventListener('click', async () => { + if (mediaRecorder && mediaRecorder.state === 'recording') { + // Stop recording + mediaRecorder.stop(); + voiceBtn.classList.remove('recording'); + } else { + // Start recording + try { + if (!mediaRecorder) { + const stream = await navigator.mediaDevices.getUserMedia({ audio: true }); + mediaRecorder = new MediaRecorder(stream); + + mediaRecorder.ondataavailable = e => chunks.push(e.data); + + mediaRecorder.onstop = async () => { + const blob = new Blob(chunks, { type: 'audio/webm' }); + chunks = []; + + // Show uploading status + queryResponse.textContent = 'Processing voice command...'; + queryResponse.className = 'query-response'; + queryResponse.style.display = 'block'; + + const formData = new FormData(); + formData.append('file', blob, 'recording.webm'); + + try { + const res = await fetch('/upload_audio', { + method: 'POST', + body: formData + }); + const json = await res.json(); + + if (json.success) { + queryResponse.textContent = 'Voice command received!'; + queryResponse.className = 'query-response success'; + setTimeout(() => { + queryResponse.style.display = 'none'; + }, 3000); + } else { + queryResponse.textContent = 'Error: ' + json.message; + queryResponse.className = 'query-response error'; + } + } catch (err) { + queryResponse.textContent = 'Upload failed: ' + err.message; + queryResponse.className = 'query-response error'; + } + }; + } + + mediaRecorder.start(); + voiceBtn.classList.add('recording'); + } catch (err) { + alert('Microphone access denied. Please allow microphone access to use voice commands.'); + } + } + }); + {% endif %} + // Handle query form submission document.getElementById('queryForm').addEventListener('submit', async function(e) { e.preventDefault(); diff --git a/dimos/web/robot_web_interface.py b/dimos/web/robot_web_interface.py index 72dcce9d29..33847c0056 100644 --- a/dimos/web/robot_web_interface.py +++ b/dimos/web/robot_web_interface.py @@ -23,12 +23,13 @@ class RobotWebInterface(FastAPIServer): """Wrapper class for the dimos-interface FastAPI server.""" - def __init__(self, port=5555, text_streams=None, **streams): + def __init__(self, port=5555, text_streams=None, audio_subject=None, **streams): super().__init__( dev_name="Robot Web Interface", edge_type="Bidirectional", host="0.0.0.0", port=port, text_streams=text_streams, + audio_subject=audio_subject, **streams, ) diff --git a/tests/run.py b/tests/run.py index 72b05ea2df..977742ba34 100644 --- a/tests/run.py +++ b/tests/run.py @@ -24,7 +24,6 @@ # from dimos.robot.unitree.unitree_ros_control import UnitreeROSControl from dimos.robot.unitree.unitree_skills import MyUnitreeSkills from dimos.web.robot_web_interface import RobotWebInterface -from dimos.web.voice_web_interface import VoiceWebInterface from dimos.web.websocket_vis.server import WebsocketVis from dimos.skills.observe_stream import ObserveStream from dimos.skills.observe import Observe @@ -212,6 +211,7 @@ def newmap(msg): agent_response_subject = rx.subject.Subject() agent_response_stream = agent_response_subject.pipe(ops.share()) local_planner_viz_stream = robot.local_planner_viz_stream.pipe(ops.share()) +audio_subject = rx.subject.Subject() # Initialize object detection stream min_confidence = 0.6 @@ -281,13 +281,12 @@ def combine_with_locations(object_detections): "agent_responses": agent_response_stream, } -web_interface = RobotWebInterface(port=5555, text_streams=text_streams, **streams) - -# Initialize voice web interface -voice_web_interface = VoiceWebInterface(port=5560) +web_interface = RobotWebInterface( + port=5555, text_streams=text_streams, audio_subject=audio_subject, **streams +) stt_node = stt() -stt_node.consume_audio(voice_web_interface.emit_audio()) +stt_node.consume_audio(audio_subject.pipe(ops.share())) # Read system query from prompt.txt file with open( @@ -344,11 +343,6 @@ def combine_with_locations(object_detections): web_thread.daemon = True web_thread.start() -# Start voice web interface in a separate thread to avoid blocking -voice_thread = threading.Thread(target=voice_web_interface.run) -voice_thread.daemon = True -voice_thread.start() - try: while True: # Main loop - can add robot movement or other logic here From cb64e92db05d32c474768dac9a8012a28d7171b7 Mon Sep 17 00:00:00 2001 From: stash Date: Thu, 10 Jul 2025 05:59:06 -0700 Subject: [PATCH 08/18] Voice streaming added to dimOS terminal interface fully tested --- dimos/web/dimos_interface/src/App.svelte | 22 ++ .../src/components/Input.svelte | 60 +++-- .../src/components/VoiceButton.svelte | 255 ++++++++++++++++++ 3 files changed, 309 insertions(+), 28 deletions(-) create mode 100644 dimos/web/dimos_interface/src/components/VoiceButton.svelte diff --git a/dimos/web/dimos_interface/src/App.svelte b/dimos/web/dimos_interface/src/App.svelte index a1ddae4931..c249f3e3ea 100644 --- a/dimos/web/dimos_interface/src/App.svelte +++ b/dimos/web/dimos_interface/src/App.svelte @@ -3,7 +3,27 @@ import Input from './components/Input.svelte'; import History from './components/History.svelte'; import StreamViewer from './components/StreamViewer.svelte'; + import VoiceButton from './components/VoiceButton.svelte'; import { theme } from './stores/theme'; + import { history } from './stores/history'; + + const handleVoiceCommand = async (event: CustomEvent) => { + if (event.detail.success) { + // Show voice processing message + history.update(h => [...h, { + command: '[voice command]', + outputs: ['Processing voice command...'] + }]); + + // The actual command will be processed by the agent through the audio pipeline + // and will appear in the text stream + } else { + history.update(h => [...h, { + command: '[voice command]', + outputs: [`Error: ${event.detail.error}`] + }]); + } + }; @@ -29,3 +49,5 @@ + + diff --git a/dimos/web/dimos_interface/src/components/Input.svelte b/dimos/web/dimos_interface/src/components/Input.svelte index 86eba9184d..d7cee270f1 100644 --- a/dimos/web/dimos_interface/src/components/Input.svelte +++ b/dimos/web/dimos_interface/src/components/Input.svelte @@ -29,34 +29,7 @@ const handleKeyDown = async (event: KeyboardEvent) => { if (event.key === 'Enter') { - const [commandName, ...args] = command.split(' '); - - if (import.meta.env.VITE_TRACKING_ENABLED === 'true') { - track(commandName, ...args); - } - - const commandFunction = commands[commandName]; - - if (commandFunction) { - const output = await commandFunction(args); - - if (commandName !== 'clear') { - if (output && typeof output === 'object' && 'type' in output && output.type === 'STREAM_START') { - // Add initial message to history - $history = [...$history, { command, outputs: [output.initialMessage] }]; - // Connect to text stream - connectTextStream(output.streamKey); - } else { - $history = [...$history, { command, outputs: [output] }]; - } - } - } else { - const output = `${commandName}: command not found`; - $history = [...$history, { command, outputs: [output] }]; - } - - command = ''; - historyIndex = -1; + await executeCommand(); } else if (event.key === 'ArrowUp') { if (historyIndex < $history.length - 1) { historyIndex++; @@ -80,6 +53,37 @@ $history = []; } }; + + const executeCommand = async () => { + const [commandName, ...args] = command.split(' '); + + if (import.meta.env.VITE_TRACKING_ENABLED === 'true') { + track(commandName, ...args); + } + + const commandFunction = commands[commandName]; + + if (commandFunction) { + const output = await commandFunction(args); + + if (commandName !== 'clear') { + if (output && typeof output === 'object' && 'type' in output && output.type === 'STREAM_START') { + // Add initial message to history + $history = [...$history, { command, outputs: [output.initialMessage] }]; + // Connect to text stream + connectTextStream(output.streamKey); + } else { + $history = [...$history, { command, outputs: [output] }]; + } + } + } else { + const output = `${commandName}: command not found`; + $history = [...$history, { command, outputs: [output] }]; + } + + command = ''; + historyIndex = -1; + }; + + + + + + + + \ No newline at end of file From ef9ad98fddeb53a59f1d545f23ab5f94089579eb Mon Sep 17 00:00:00 2001 From: stash Date: Thu, 10 Jul 2025 06:37:29 -0700 Subject: [PATCH 09/18] Expose interface to all network connections --- dimos/web/dimos_interface/vite.config.ts | 1 + 1 file changed, 1 insertion(+) diff --git a/dimos/web/dimos_interface/vite.config.ts b/dimos/web/dimos_interface/vite.config.ts index 296050256a..29be79dd4a 100644 --- a/dimos/web/dimos_interface/vite.config.ts +++ b/dimos/web/dimos_interface/vite.config.ts @@ -22,6 +22,7 @@ export default defineConfig({ plugins: [svelte()], server: { port: 3000, + host: '0.0.0.0', watch: { // Exclude node_modules, .git and other large directories ignored: ['**/node_modules/**', '**/.git/**', '**/dist/**', 'lambda/**'], From e27c85fa79f2e619426d914844dbb8d250cab9c4 Mon Sep 17 00:00:00 2001 From: stash Date: Thu, 10 Jul 2025 07:55:27 -0700 Subject: [PATCH 10/18] UnitreeSpeak skill on the robot working and speed optimized over webrtc --- dimos/skills/unitree/__init__.py | 1 + dimos/skills/unitree/unitree_speak.py | 296 ++++++++++++++++++++++++++ tests/run.py | 6 +- 3 files changed, 300 insertions(+), 3 deletions(-) create mode 100644 dimos/skills/unitree/__init__.py create mode 100644 dimos/skills/unitree/unitree_speak.py diff --git a/dimos/skills/unitree/__init__.py b/dimos/skills/unitree/__init__.py new file mode 100644 index 0000000000..8b13789179 --- /dev/null +++ b/dimos/skills/unitree/__init__.py @@ -0,0 +1 @@ + diff --git a/dimos/skills/unitree/unitree_speak.py b/dimos/skills/unitree/unitree_speak.py new file mode 100644 index 0000000000..e121f3c5c2 --- /dev/null +++ b/dimos/skills/unitree/unitree_speak.py @@ -0,0 +1,296 @@ +# Copyright 2025 Dimensional Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from dimos.skills.skills import AbstractRobotSkill +from pydantic import Field +import time +import tempfile +import os +import json +import base64 +import hashlib +import soundfile as sf +import numpy as np +from openai import OpenAI +from dimos.utils.logging_config import setup_logger +from go2_webrtc_driver.constants import RTC_TOPIC + +logger = setup_logger("dimos.skills.unitree.unitree_speak") + +# Audio API constants (from go2_webrtc_driver) +AUDIO_API = { + "GET_AUDIO_LIST": 1001, + "SELECT_START_PLAY": 1002, + "PAUSE": 1003, + "UNSUSPEND": 1004, + "SET_PLAY_MODE": 1007, + "UPLOAD_AUDIO_FILE": 2001, + "ENTER_MEGAPHONE": 4001, + "EXIT_MEGAPHONE": 4002, + "UPLOAD_MEGAPHONE": 4003, +} + +PLAY_MODES = { + "NO_CYCLE": "no_cycle", + "SINGLE_CYCLE": "single_cycle", + "LIST_LOOP": "list_loop" +} + + +class UnitreeSpeak(AbstractRobotSkill): + """Speak text out loud through the robot's speakers using WebRTC audio upload.""" + + text: str = Field(..., description="Text to speak") + voice: str = Field(default="echo", description="Voice to use (alloy, echo, fable, onyx, nova, shimmer)") + speed: float = Field(default=1.2, description="Speech speed (0.25 to 4.0)") + use_megaphone: bool = Field(default=False, description="Use megaphone mode for lower latency (experimental)") + + def __init__(self, **data): + super().__init__(**data) + self._openai_client = None + + def _get_openai_client(self): + if self._openai_client is None: + self._openai_client = OpenAI() + return self._openai_client + + def _generate_audio(self, text: str) -> bytes: + try: + client = self._get_openai_client() + response = client.audio.speech.create( + model="tts-1", + voice=self.voice, + input=text, + speed=self.speed, + response_format="mp3" + ) + return response.content + except Exception as e: + logger.error(f"Error generating audio: {e}") + raise + + def _webrtc_request(self, api_id: int, parameter: dict = None): + if parameter is None: + parameter = {} + + request_data = { + "api_id": api_id, + "parameter": json.dumps(parameter) if parameter else "{}" + } + + return self._robot.webrtc_connection.publish_request( + RTC_TOPIC["AUDIO_HUB_REQ"], + request_data + ) + + def _upload_audio_to_robot(self, audio_data: bytes, filename: str) -> str: + try: + file_md5 = hashlib.md5(audio_data).hexdigest() + b64_data = base64.b64encode(audio_data).decode('utf-8') + + chunk_size = 61440 + chunks = [b64_data[i:i + chunk_size] for i in range(0, len(b64_data), chunk_size)] + total_chunks = len(chunks) + + logger.info(f"Uploading audio '{filename}' in {total_chunks} chunks (optimized)") + + for i, chunk in enumerate(chunks, 1): + parameter = { + 'file_name': filename, + 'file_type': 'wav', + 'file_size': len(audio_data), + 'current_block_index': i, + 'total_block_number': total_chunks, + 'block_content': chunk, + 'current_block_size': len(chunk), + 'file_md5': file_md5, + 'create_time': int(time.time() * 1000) + } + + logger.debug(f"Sending chunk {i}/{total_chunks}") + response = self._webrtc_request( + AUDIO_API["UPLOAD_AUDIO_FILE"], + parameter + ) + + logger.info(f"Audio upload completed for '{filename}'") + + list_response = self._webrtc_request(AUDIO_API["GET_AUDIO_LIST"], {}) + + if list_response and 'data' in list_response: + data_str = list_response.get('data', {}).get('data', '{}') + audio_list = json.loads(data_str).get('audio_list', []) + + for audio in audio_list: + if audio.get('CUSTOM_NAME') == filename: + return audio.get('UNIQUE_ID') + + logger.warning(f"Could not find uploaded audio '{filename}' in list, using filename as UUID") + return filename + + except Exception as e: + logger.error(f"Error uploading audio to robot: {e}") + raise + + def _play_audio_on_robot(self, uuid: str): + try: + self._webrtc_request( + AUDIO_API["SET_PLAY_MODE"], + {'play_mode': PLAY_MODES["NO_CYCLE"]} + ) + time.sleep(0.1) + + parameter = {'unique_id': uuid} + + logger.info(f"Playing audio with UUID: {uuid}") + self._webrtc_request( + AUDIO_API["SELECT_START_PLAY"], + parameter + ) + + except Exception as e: + logger.error(f"Error playing audio on robot: {e}") + raise + + def _stop_audio_playback(self): + try: + logger.debug("Stopping audio playback") + self._webrtc_request(AUDIO_API["PAUSE"], {}) + except Exception as e: + logger.warning(f"Error stopping audio playback: {e}") + + def _upload_and_play_megaphone(self, audio_data: bytes, duration: float): + try: + logger.debug("Entering megaphone mode") + self._webrtc_request(AUDIO_API["ENTER_MEGAPHONE"], {}) + + time.sleep(0.2) + + b64_data = base64.b64encode(audio_data).decode('utf-8') + + chunk_size = 4096 + chunks = [b64_data[i:i + chunk_size] for i in range(0, len(b64_data), chunk_size)] + total_chunks = len(chunks) + + logger.info(f"Uploading megaphone audio in {total_chunks} chunks") + + for i, chunk in enumerate(chunks, 1): + parameter = { + 'current_block_size': len(chunk), + 'block_content': chunk, + 'current_block_index': i, + 'total_block_number': total_chunks + } + + logger.debug(f"Sending megaphone chunk {i}/{total_chunks}") + self._webrtc_request( + AUDIO_API["UPLOAD_MEGAPHONE"], + parameter + ) + + if i < total_chunks: + time.sleep(0.05) + + logger.info("Megaphone audio upload completed, waiting for playback") + + time.sleep(duration + 1.0) + + except Exception as e: + logger.error(f"Error in megaphone mode: {e}") + try: + self._webrtc_request(AUDIO_API["EXIT_MEGAPHONE"], {}) + except: + pass + raise + finally: + try: + logger.debug("Exiting megaphone mode") + self._webrtc_request(AUDIO_API["EXIT_MEGAPHONE"], {}) + time.sleep(0.1) + except Exception as e: + logger.warning(f"Error exiting megaphone mode: {e}") + + def __call__(self): + super().__call__() + + if not self._robot: + logger.error("No robot instance provided to UnitreeSpeak skill") + return "Error: No robot instance available" + + try: + display_text = self.text[:50] + "..." if len(self.text) > 50 else self.text + logger.info(f"Speaking: '{display_text}'") + + logger.debug("Generating audio with OpenAI TTS") + audio_data = self._generate_audio(self.text) + + with tempfile.NamedTemporaryFile(suffix='.mp3', delete=False) as tmp_mp3: + tmp_mp3.write(audio_data) + tmp_mp3_path = tmp_mp3.name + + try: + audio_array, sample_rate = sf.read(tmp_mp3_path) + + if audio_array.ndim > 1: + audio_array = np.mean(audio_array, axis=1) + + target_sample_rate = 22050 + if sample_rate != target_sample_rate: + logger.debug(f"Resampling from {sample_rate}Hz to {target_sample_rate}Hz") + old_length = len(audio_array) + new_length = int(old_length * target_sample_rate / sample_rate) + old_indices = np.arange(old_length) + new_indices = np.linspace(0, old_length - 1, new_length) + audio_array = np.interp(new_indices, old_indices, audio_array) + sample_rate = target_sample_rate + + audio_array = audio_array / np.max(np.abs(audio_array)) + + with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as tmp_wav: + sf.write(tmp_wav.name, audio_array, sample_rate, format='WAV', subtype='PCM_16') + tmp_wav.seek(0) + wav_data = open(tmp_wav.name, 'rb').read() + os.unlink(tmp_wav.name) + + logger.info(f"Audio size: {len(wav_data) / 1024:.1f}KB, duration: {len(audio_array) / sample_rate:.1f}s") + + finally: + os.unlink(tmp_mp3_path) + + if self.use_megaphone: + logger.debug("Using megaphone mode for lower latency") + duration = len(audio_array) / sample_rate + self._upload_and_play_megaphone(wav_data, duration) + + return f"Spoke: '{display_text}' on robot successfully (megaphone mode)" + else: + filename = f"speak_{int(time.time() * 1000)}" + + logger.debug("Uploading audio to robot") + uuid = self._upload_audio_to_robot(wav_data, filename) + + logger.debug("Playing audio on robot") + self._play_audio_on_robot(uuid) + + duration = len(audio_array) / sample_rate + logger.debug(f"Waiting {duration:.1f}s for playback to complete") + # time.sleep(duration + 0.2) + + # self._stop_audio_playback() + + return f"Spoke: '{display_text}' on robot successfully" + + except Exception as e: + logger.error(f"Error in speak skill: {e}") + return f"Error speaking text: {str(e)}" diff --git a/tests/run.py b/tests/run.py index 977742ba34..9ae6f81398 100644 --- a/tests/run.py +++ b/tests/run.py @@ -36,7 +36,7 @@ import threading import json from dimos.types.vector import Vector -from dimos.skills.speak import Speak +from dimos.skills.unitree.unitree_speak import UnitreeSpeak from dimos.perception.object_detection_stream import ObjectDetectionStream from dimos.perception.detection2d.detic_2d_det import Detic2DDetector @@ -318,7 +318,7 @@ def combine_with_locations(object_detections): robot_skills.add(NavigateWithText) # robot_skills.add(FollowHuman) # TODO: broken robot_skills.add(GetPose) -# robot_skills.add(Speak) +robot_skills.add(UnitreeSpeak) # Re-enable Speak skill robot_skills.add(NavigateToGoal) robot_skills.add(Explore) @@ -330,7 +330,7 @@ def combine_with_locations(object_detections): robot_skills.create_instance("GetPose", robot=robot) robot_skills.create_instance("NavigateToGoal", robot=robot) robot_skills.create_instance("Explore", robot=robot) -# robot_skills.create_instance("Speak", tts_node=tts_node) +robot_skills.create_instance("UnitreeSpeak", robot=robot) # Now only needs robot instance # Subscribe to agent responses and send them to the subject agent.get_response_observable().subscribe(lambda x: agent_response_subject.on_next(x)) From f42f372e6952262d0fc3f6a0391fd99f229c1b99 Mon Sep 17 00:00:00 2001 From: spomichter <12108168+spomichter@users.noreply.github.com> Date: Thu, 10 Jul 2025 14:56:07 +0000 Subject: [PATCH 11/18] CI code cleanup --- dimos/skills/unitree/unitree_speak.py | 194 ++++++++++++-------------- 1 file changed, 89 insertions(+), 105 deletions(-) diff --git a/dimos/skills/unitree/unitree_speak.py b/dimos/skills/unitree/unitree_speak.py index e121f3c5c2..05004398f9 100644 --- a/dimos/skills/unitree/unitree_speak.py +++ b/dimos/skills/unitree/unitree_speak.py @@ -41,20 +41,20 @@ "UPLOAD_MEGAPHONE": 4003, } -PLAY_MODES = { - "NO_CYCLE": "no_cycle", - "SINGLE_CYCLE": "single_cycle", - "LIST_LOOP": "list_loop" -} +PLAY_MODES = {"NO_CYCLE": "no_cycle", "SINGLE_CYCLE": "single_cycle", "LIST_LOOP": "list_loop"} class UnitreeSpeak(AbstractRobotSkill): """Speak text out loud through the robot's speakers using WebRTC audio upload.""" text: str = Field(..., description="Text to speak") - voice: str = Field(default="echo", description="Voice to use (alloy, echo, fable, onyx, nova, shimmer)") + voice: str = Field( + default="echo", description="Voice to use (alloy, echo, fable, onyx, nova, shimmer)" + ) speed: float = Field(default=1.2, description="Speech speed (0.25 to 4.0)") - use_megaphone: bool = Field(default=False, description="Use megaphone mode for lower latency (experimental)") + use_megaphone: bool = Field( + default=False, description="Use megaphone mode for lower latency (experimental)" + ) def __init__(self, **data): super().__init__(**data) @@ -69,11 +69,7 @@ def _generate_audio(self, text: str) -> bytes: try: client = self._get_openai_client() response = client.audio.speech.create( - model="tts-1", - voice=self.voice, - input=text, - speed=self.speed, - response_format="mp3" + model="tts-1", voice=self.voice, input=text, speed=self.speed, response_format="mp3" ) return response.content except Exception as e: @@ -83,82 +79,71 @@ def _generate_audio(self, text: str) -> bytes: def _webrtc_request(self, api_id: int, parameter: dict = None): if parameter is None: parameter = {} - - request_data = { - "api_id": api_id, - "parameter": json.dumps(parameter) if parameter else "{}" - } - + + request_data = {"api_id": api_id, "parameter": json.dumps(parameter) if parameter else "{}"} + return self._robot.webrtc_connection.publish_request( - RTC_TOPIC["AUDIO_HUB_REQ"], - request_data + RTC_TOPIC["AUDIO_HUB_REQ"], request_data ) def _upload_audio_to_robot(self, audio_data: bytes, filename: str) -> str: try: file_md5 = hashlib.md5(audio_data).hexdigest() - b64_data = base64.b64encode(audio_data).decode('utf-8') - + b64_data = base64.b64encode(audio_data).decode("utf-8") + chunk_size = 61440 - chunks = [b64_data[i:i + chunk_size] for i in range(0, len(b64_data), chunk_size)] + chunks = [b64_data[i : i + chunk_size] for i in range(0, len(b64_data), chunk_size)] total_chunks = len(chunks) - + logger.info(f"Uploading audio '{filename}' in {total_chunks} chunks (optimized)") for i, chunk in enumerate(chunks, 1): parameter = { - 'file_name': filename, - 'file_type': 'wav', - 'file_size': len(audio_data), - 'current_block_index': i, - 'total_block_number': total_chunks, - 'block_content': chunk, - 'current_block_size': len(chunk), - 'file_md5': file_md5, - 'create_time': int(time.time() * 1000) + "file_name": filename, + "file_type": "wav", + "file_size": len(audio_data), + "current_block_index": i, + "total_block_number": total_chunks, + "block_content": chunk, + "current_block_size": len(chunk), + "file_md5": file_md5, + "create_time": int(time.time() * 1000), } - + logger.debug(f"Sending chunk {i}/{total_chunks}") - response = self._webrtc_request( - AUDIO_API["UPLOAD_AUDIO_FILE"], - parameter - ) - + response = self._webrtc_request(AUDIO_API["UPLOAD_AUDIO_FILE"], parameter) + logger.info(f"Audio upload completed for '{filename}'") - + list_response = self._webrtc_request(AUDIO_API["GET_AUDIO_LIST"], {}) - - if list_response and 'data' in list_response: - data_str = list_response.get('data', {}).get('data', '{}') - audio_list = json.loads(data_str).get('audio_list', []) - + + if list_response and "data" in list_response: + data_str = list_response.get("data", {}).get("data", "{}") + audio_list = json.loads(data_str).get("audio_list", []) + for audio in audio_list: - if audio.get('CUSTOM_NAME') == filename: - return audio.get('UNIQUE_ID') - - logger.warning(f"Could not find uploaded audio '{filename}' in list, using filename as UUID") + if audio.get("CUSTOM_NAME") == filename: + return audio.get("UNIQUE_ID") + + logger.warning( + f"Could not find uploaded audio '{filename}' in list, using filename as UUID" + ) return filename - + except Exception as e: logger.error(f"Error uploading audio to robot: {e}") raise def _play_audio_on_robot(self, uuid: str): try: - self._webrtc_request( - AUDIO_API["SET_PLAY_MODE"], - {'play_mode': PLAY_MODES["NO_CYCLE"]} - ) + self._webrtc_request(AUDIO_API["SET_PLAY_MODE"], {"play_mode": PLAY_MODES["NO_CYCLE"]}) time.sleep(0.1) - - parameter = {'unique_id': uuid} - + + parameter = {"unique_id": uuid} + logger.info(f"Playing audio with UUID: {uuid}") - self._webrtc_request( - AUDIO_API["SELECT_START_PLAY"], - parameter - ) - + self._webrtc_request(AUDIO_API["SELECT_START_PLAY"], parameter) + except Exception as e: logger.error(f"Error playing audio on robot: {e}") raise @@ -174,38 +159,35 @@ def _upload_and_play_megaphone(self, audio_data: bytes, duration: float): try: logger.debug("Entering megaphone mode") self._webrtc_request(AUDIO_API["ENTER_MEGAPHONE"], {}) - + time.sleep(0.2) - - b64_data = base64.b64encode(audio_data).decode('utf-8') - + + b64_data = base64.b64encode(audio_data).decode("utf-8") + chunk_size = 4096 - chunks = [b64_data[i:i + chunk_size] for i in range(0, len(b64_data), chunk_size)] + chunks = [b64_data[i : i + chunk_size] for i in range(0, len(b64_data), chunk_size)] total_chunks = len(chunks) - + logger.info(f"Uploading megaphone audio in {total_chunks} chunks") for i, chunk in enumerate(chunks, 1): parameter = { - 'current_block_size': len(chunk), - 'block_content': chunk, - 'current_block_index': i, - 'total_block_number': total_chunks + "current_block_size": len(chunk), + "block_content": chunk, + "current_block_index": i, + "total_block_number": total_chunks, } - + logger.debug(f"Sending megaphone chunk {i}/{total_chunks}") - self._webrtc_request( - AUDIO_API["UPLOAD_MEGAPHONE"], - parameter - ) - + self._webrtc_request(AUDIO_API["UPLOAD_MEGAPHONE"], parameter) + if i < total_chunks: time.sleep(0.05) - + logger.info("Megaphone audio upload completed, waiting for playback") - + time.sleep(duration + 1.0) - + except Exception as e: logger.error(f"Error in megaphone mode: {e}") try: @@ -223,28 +205,28 @@ def _upload_and_play_megaphone(self, audio_data: bytes, duration: float): def __call__(self): super().__call__() - + if not self._robot: logger.error("No robot instance provided to UnitreeSpeak skill") return "Error: No robot instance available" - + try: display_text = self.text[:50] + "..." if len(self.text) > 50 else self.text logger.info(f"Speaking: '{display_text}'") - + logger.debug("Generating audio with OpenAI TTS") audio_data = self._generate_audio(self.text) - - with tempfile.NamedTemporaryFile(suffix='.mp3', delete=False) as tmp_mp3: + + with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as tmp_mp3: tmp_mp3.write(audio_data) tmp_mp3_path = tmp_mp3.name - + try: audio_array, sample_rate = sf.read(tmp_mp3_path) - + if audio_array.ndim > 1: audio_array = np.mean(audio_array, axis=1) - + target_sample_rate = 22050 if sample_rate != target_sample_rate: logger.debug(f"Resampling from {sample_rate}Hz to {target_sample_rate}Hz") @@ -254,43 +236,45 @@ def __call__(self): new_indices = np.linspace(0, old_length - 1, new_length) audio_array = np.interp(new_indices, old_indices, audio_array) sample_rate = target_sample_rate - + audio_array = audio_array / np.max(np.abs(audio_array)) - - with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as tmp_wav: - sf.write(tmp_wav.name, audio_array, sample_rate, format='WAV', subtype='PCM_16') + + with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_wav: + sf.write(tmp_wav.name, audio_array, sample_rate, format="WAV", subtype="PCM_16") tmp_wav.seek(0) - wav_data = open(tmp_wav.name, 'rb').read() + wav_data = open(tmp_wav.name, "rb").read() os.unlink(tmp_wav.name) - - logger.info(f"Audio size: {len(wav_data) / 1024:.1f}KB, duration: {len(audio_array) / sample_rate:.1f}s") - + + logger.info( + f"Audio size: {len(wav_data) / 1024:.1f}KB, duration: {len(audio_array) / sample_rate:.1f}s" + ) + finally: os.unlink(tmp_mp3_path) - + if self.use_megaphone: logger.debug("Using megaphone mode for lower latency") duration = len(audio_array) / sample_rate self._upload_and_play_megaphone(wav_data, duration) - + return f"Spoke: '{display_text}' on robot successfully (megaphone mode)" else: filename = f"speak_{int(time.time() * 1000)}" - + logger.debug("Uploading audio to robot") uuid = self._upload_audio_to_robot(wav_data, filename) - + logger.debug("Playing audio on robot") self._play_audio_on_robot(uuid) - + duration = len(audio_array) / sample_rate logger.debug(f"Waiting {duration:.1f}s for playback to complete") # time.sleep(duration + 0.2) - + # self._stop_audio_playback() - + return f"Spoke: '{display_text}' on robot successfully" - + except Exception as e: logger.error(f"Error in speak skill: {e}") return f"Error speaking text: {str(e)}" From 5c4f30522644f2175c8b3132190f0e8154f3d892 Mon Sep 17 00:00:00 2001 From: alexlin2 Date: Thu, 10 Jul 2025 13:38:46 -0700 Subject: [PATCH 12/18] fixed spatial memory cb bug, do recovery after few skills --- .../wavefront_frontier_goal_selector.py | 2 +- dimos/robot/unitree_webrtc/unitree_skills.py | 8 +------- dimos/skills/navigation.py | 6 ------ 3 files changed, 2 insertions(+), 14 deletions(-) diff --git a/dimos/robot/frontier_exploration/wavefront_frontier_goal_selector.py b/dimos/robot/frontier_exploration/wavefront_frontier_goal_selector.py index 5f9032aa28..76f2ddbb0a 100644 --- a/dimos/robot/frontier_exploration/wavefront_frontier_goal_selector.py +++ b/dimos/robot/frontier_exploration/wavefront_frontier_goal_selector.py @@ -83,7 +83,7 @@ def __init__( self, min_frontier_size: int = 10, occupancy_threshold: int = 65, - subsample_resolution: int = 3, + subsample_resolution: int = 2, min_distance_from_robot: float = 0.5, explored_area_buffer: float = 0.5, min_distance_from_obstacles: float = 0.6, diff --git a/dimos/robot/unitree_webrtc/unitree_skills.py b/dimos/robot/unitree_webrtc/unitree_skills.py index e6bc4ec759..f9dfc1eede 100644 --- a/dimos/robot/unitree_webrtc/unitree_skills.py +++ b/dimos/robot/unitree_webrtc/unitree_skills.py @@ -50,14 +50,8 @@ ( "RecoveryStand", 1006, - "Recovers the robot to a state from which it can take more commands. Useful to run after multiple dynamic commands like front flips.", + "Recovers the robot to a state from which it can take more commands. Useful to run after multiple dynamic commands like front flips, Must run after skills like sit and jump and standup.", ), - ( - "Euler", - 1007, - "Adjusts the robot's orientation using Euler angles, providing precise control over its rotation.", - ), - # ("Move", 1008, "Move the robot using velocity commands."), # Handled separately ("Sit", 1009, "Commands the robot to sit down from a standing or moving stance."), ( "RiseSit", diff --git a/dimos/skills/navigation.py b/dimos/skills/navigation.py index 114e3dacb8..b24ee518c1 100644 --- a/dimos/skills/navigation.py +++ b/dimos/skills/navigation.py @@ -405,12 +405,6 @@ def stop(self): logger.error(f"Error disposing navigation task: {e}") self._navigation_disposable = None - # Clean up spatial memory if it exists - if hasattr(self, "_spatial_memory") and self._spatial_memory is not None: - logger.info("Cleaning up spatial memory") - self._spatial_memory.cleanup() - self._spatial_memory = None - return "Navigate skill stopped successfully." From a260626c65fc0ad08128367e0850e4c55b5ef93d Mon Sep 17 00:00:00 2001 From: alexlin2 Date: Thu, 10 Jul 2025 18:15:15 -0700 Subject: [PATCH 13/18] added some prompt changes, added new skip_visual_search flag to navigateWithText --- assets/agent/prompt.txt | 8 +++++++- dimos/skills/navigation.py | 38 +++++++++++++++++++++++--------------- 2 files changed, 30 insertions(+), 16 deletions(-) diff --git a/assets/agent/prompt.txt b/assets/agent/prompt.txt index 8a2ccff020..4987320263 100644 --- a/assets/agent/prompt.txt +++ b/assets/agent/prompt.txt @@ -64,7 +64,13 @@ Saved Robot Locations: ***ALWAYS CHECK FIRST if you can find a navigation query in the Saved Robot Locations before running the NavigateWithText tool call. If a saved location is found, get there with NavigateToGoal.*** -***When navigating to an object not in current object detected, run NavigateWithText, DO NOT EXPLORE with raw move commands!!! +***Don't use object detections for navigating to an object, PRIORITIZE running NavigateWithText. Only use object detections if NavigateWithText fails*** + +***When running NavigateWithText, set skip_visual_search flag to TRUE if the query is a general location such as kitchen or office, if it fails, then run without this flag*** + +***When navigating to an object not in current object detected, run NavigateWithText, DO NOT EXPLORE with raw move commands!!!*** + +***Prioritize on using the query output from observe skill rather than object detections when the user gives a specific visual query such as "Is there a person wearing a red hat in the view?"*** PLANNING & REASONING: - You can develop both short-term and long-term plans to achieve complex goals diff --git a/dimos/skills/navigation.py b/dimos/skills/navigation.py index b24ee518c1..6d67ae04f2 100644 --- a/dimos/skills/navigation.py +++ b/dimos/skills/navigation.py @@ -60,18 +60,24 @@ class NavigateWithText(AbstractRobotSkill): This skill first attempts to locate an object in the robot's camera view using vision. If the object is found, it navigates to it. If not, it falls back to querying the - semantic map for a location matching the description. For example, "Find the kitchen" - will first look for a kitchen in view, then check the semantic map coordinates where - a kitchen was previously observed. + semantic map for a location matching the description. For example, "Find the Teddy Bear" + will first look for a Teddy Bear in view, then check the semantic map coordinates where + a Teddy Bear was previously observed. CALL THIS SKILL FOR ONE SUBJECT AT A TIME. For example: "Go to the person wearing a blue shirt in the living room", you should call this skill twice, once for the person wearing a blue shirt and once for the living room. + + If skip_visual_search is True, this skill will skip the visual search for the object in view. + This is useful if you want to navigate to a general location such as a kitchen or office. + For example, "Go to the kitchen" will not look for a kitchen in view, but will check the semantic map coordinates where + a kitchen was previously observed. """ query: str = Field("", description="Text query to search for in the semantic map") limit: int = Field(1, description="Maximum number of results to return") distance: float = Field(1.0, description="Desired distance to maintain from object in meters") + skip_visual_search: bool = Field(False, description="Skip visual search for object in view") timeout: float = Field(40.0, description="Maximum time to spend navigating in seconds") def __init__(self, robot=None, **data): @@ -362,22 +368,24 @@ def __call__(self): # First, try to find and navigate to the object in camera view logger.info(f"First attempting to find and navigate to visible object: '{self.query}'") - object_result = self._navigate_to_object() - if object_result and object_result["success"]: - logger.info(f"Successfully navigated to {self.query} in view") - return object_result + if not self.skip_visual_search: + object_result = self._navigate_to_object() + + if object_result and object_result["success"]: + logger.info(f"Successfully navigated to {self.query} in view") + return object_result + + elif object_result and object_result["failure_reason"] == "Navigation": + logger.info( + f"Failed to navigate to {self.query} in view: {object_result.get('error', 'Unknown error')}" + ) + return object_result - elif object_result and object_result["failure_reason"] == "Navigation": + # If object navigation failed, fall back to semantic map logger.info( - f"Failed to navigate to {self.query} in view: {object_result.get('error', 'Unknown error')}" + f"Object not found in view. Falling back to semantic map query for: '{self.query}'" ) - return object_result - - # If object navigation failed, fall back to semantic map - logger.info( - f"Object not found in view. Falling back to semantic map query for: '{self.query}'" - ) return self._navigate_using_semantic_map() From 5674352755bd3641fc302935f19009dc32be4b8a Mon Sep 17 00:00:00 2001 From: stash Date: Thu, 10 Jul 2025 18:20:00 -0700 Subject: [PATCH 14/18] Fixed observe skill --- dimos/models/qwen/video_query.py | 2 +- dimos/skills/observe.py | 12 +++++------- 2 files changed, 6 insertions(+), 8 deletions(-) diff --git a/dimos/models/qwen/video_query.py b/dimos/models/qwen/video_query.py index 3471e52268..7eda5f1aed 100644 --- a/dimos/models/qwen/video_query.py +++ b/dimos/models/qwen/video_query.py @@ -128,7 +128,7 @@ def query_single_frame( openai_client=qwen_client, model_name=model_name, tokenizer=HuggingFaceTokenizer(model_name=f"Qwen/{model_name}"), - max_output_tokens_per_request=100, + max_output_tokens_per_request=8192, system_query=query, pool_scheduler=get_scheduler(), ) diff --git a/dimos/skills/observe.py b/dimos/skills/observe.py index 844df11805..e4c9a1493c 100644 --- a/dimos/skills/observe.py +++ b/dimos/skills/observe.py @@ -106,14 +106,8 @@ def __call__(self): # Process the frame with Qwen response = self._process_frame_with_qwen(frame) - # Add the response to the conversation history - # self._agent.append_to_history( - # f"Observation: {response}", - # ) - response = self._agent.run_observable_query(f"Observation: {response}") - logger.info(f"Added Qwen observation to conversation history") - return f"Observation complete: {response[:100]}..." + return f"Observation complete: {response}" except Exception as e: error_msg = f"Error in Observe skill: {e}" @@ -127,6 +121,10 @@ def _get_frame_from_stream(self): Returns: A single frame from the video stream, or None if no frame is available """ + if self._video_stream is None: + logger.error("Video stream is None") + return None + frame = None frame_subject = rx.subject.Subject() From be5450fb591740e91572db0d8b0c270d8929427e Mon Sep 17 00:00:00 2001 From: spomichter <12108168+spomichter@users.noreply.github.com> Date: Fri, 11 Jul 2025 01:21:53 +0000 Subject: [PATCH 15/18] CI code cleanup --- dimos/skills/observe.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dimos/skills/observe.py b/dimos/skills/observe.py index e4c9a1493c..067307353a 100644 --- a/dimos/skills/observe.py +++ b/dimos/skills/observe.py @@ -124,7 +124,7 @@ def _get_frame_from_stream(self): if self._video_stream is None: logger.error("Video stream is None") return None - + frame = None frame_subject = rx.subject.Subject() From ee97c9604110d34cfe82af767fdedb89de6a629f Mon Sep 17 00:00:00 2001 From: stash Date: Thu, 10 Jul 2025 18:57:23 -0700 Subject: [PATCH 16/18] Fix CORS issue --- .../src/components/Input.svelte | 42 +++++++++---------- .../src/components/VoiceButton.svelte | 9 +++- .../web/dimos_interface/src/stores/stream.ts | 13 ++++-- 3 files changed, 39 insertions(+), 25 deletions(-) diff --git a/dimos/web/dimos_interface/src/components/Input.svelte b/dimos/web/dimos_interface/src/components/Input.svelte index d7cee270f1..3a2b515f3d 100644 --- a/dimos/web/dimos_interface/src/components/Input.svelte +++ b/dimos/web/dimos_interface/src/components/Input.svelte @@ -55,34 +55,34 @@ }; const executeCommand = async () => { - const [commandName, ...args] = command.split(' '); + const [commandName, ...args] = command.split(' '); - if (import.meta.env.VITE_TRACKING_ENABLED === 'true') { - track(commandName, ...args); - } + if (import.meta.env.VITE_TRACKING_ENABLED === 'true') { + track(commandName, ...args); + } - const commandFunction = commands[commandName]; + const commandFunction = commands[commandName]; - if (commandFunction) { - const output = await commandFunction(args); + if (commandFunction) { + const output = await commandFunction(args); - if (commandName !== 'clear') { - if (output && typeof output === 'object' && 'type' in output && output.type === 'STREAM_START') { - // Add initial message to history - $history = [...$history, { command, outputs: [output.initialMessage] }]; - // Connect to text stream - connectTextStream(output.streamKey); - } else { - $history = [...$history, { command, outputs: [output] }]; + if (commandName !== 'clear') { + if (output && typeof output === 'object' && 'type' in output && output.type === 'STREAM_START') { + // Add initial message to history + $history = [...$history, { command, outputs: [output.initialMessage] }]; + // Connect to text stream + connectTextStream(output.streamKey); + } else { + $history = [...$history, { command, outputs: [output] }]; + } } + } else { + const output = `${commandName}: command not found`; + $history = [...$history, { command, outputs: [output] }]; } - } else { - const output = `${commandName}: command not found`; - $history = [...$history, { command, outputs: [output] }]; - } - command = ''; - historyIndex = -1; + command = ''; + historyIndex = -1; }; diff --git a/dimos/web/dimos_interface/src/components/VoiceButton.svelte b/dimos/web/dimos_interface/src/components/VoiceButton.svelte index e09cec0672..0f9682519a 100644 --- a/dimos/web/dimos_interface/src/components/VoiceButton.svelte +++ b/dimos/web/dimos_interface/src/components/VoiceButton.svelte @@ -21,6 +21,13 @@ const dispatch = createEventDispatcher(); + // Get the server URL dynamically based on current location + const getServerUrl = () => { + // In production, use the same host as the frontend but on port 5555 + const hostname = window.location.hostname; + return `http://${hostname}:5555`; + }; + let isRecording = false; let mediaRecorder: MediaRecorder | null = null; let chunks: Blob[] = []; @@ -50,7 +57,7 @@ formData.append('file', blob, 'recording.webm'); try { - const res = await fetch('http://0.0.0.0:5555/upload_audio', { + const res = await fetch(`${getServerUrl()}/upload_audio`, { method: 'POST', body: formData }); diff --git a/dimos/web/dimos_interface/src/stores/stream.ts b/dimos/web/dimos_interface/src/stores/stream.ts index 4ea07a71d7..eee46f84bf 100644 --- a/dimos/web/dimos_interface/src/stores/stream.ts +++ b/dimos/web/dimos_interface/src/stores/stream.ts @@ -18,6 +18,13 @@ import { writable, derived, get } from 'svelte/store'; import { simulationManager, simulationStore } from '../utils/simulation'; import { history } from './history'; +// Get the server URL dynamically based on current location +const getServerUrl = () => { + // In production, use the same host as the frontend but on port 5555 + const hostname = window.location.hostname; + return `http://${hostname}:5555`; +}; + interface StreamState { isVisible: boolean; url: string | null; @@ -65,7 +72,7 @@ export const combinedStreamState = derived( // Function to fetch available streams async function fetchAvailableStreams(): Promise { try { - const response = await fetch('http://0.0.0.0:5555/streams', { + const response = await fetch(`${getServerUrl()}/streams`, { headers: { 'Accept': 'application/json' } @@ -100,7 +107,7 @@ export const showStream = async (streamKey?: string) => { streamStore.set({ isVisible: true, - url: 'http://0.0.0.0:5555', + url: getServerUrl(), streamKeys: selectedStreams, isLoading: false, error: null, @@ -134,7 +141,7 @@ export const connectTextStream = (key: string): void => { } // Create new EventSource - const eventSource = new EventSource(`http://0.0.0.0:5555/text_stream/${key}`); + const eventSource = new EventSource(`${getServerUrl()}/text_stream/${key}`); textEventSources[key] = eventSource; // Handle incoming messages eventSource.addEventListener('message', (event) => { From 56edcfb03e12e82deb9c6b0c0c57145e219c722c Mon Sep 17 00:00:00 2001 From: alexlin2 Date: Thu, 10 Jul 2025 19:13:30 -0700 Subject: [PATCH 17/18] ammended prompt --- assets/agent/prompt.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/assets/agent/prompt.txt b/assets/agent/prompt.txt index 4987320263..b3d159a7df 100644 --- a/assets/agent/prompt.txt +++ b/assets/agent/prompt.txt @@ -64,13 +64,13 @@ Saved Robot Locations: ***ALWAYS CHECK FIRST if you can find a navigation query in the Saved Robot Locations before running the NavigateWithText tool call. If a saved location is found, get there with NavigateToGoal.*** -***Don't use object detections for navigating to an object, PRIORITIZE running NavigateWithText. Only use object detections if NavigateWithText fails*** +***Don't use object detections for navigating to an object, ALWAYS run NavigateWithText. Only use object detections if NavigateWithText fails*** ***When running NavigateWithText, set skip_visual_search flag to TRUE if the query is a general location such as kitchen or office, if it fails, then run without this flag*** ***When navigating to an object not in current object detected, run NavigateWithText, DO NOT EXPLORE with raw move commands!!!*** -***Prioritize on using the query output from observe skill rather than object detections when the user gives a specific visual query such as "Is there a person wearing a red hat in the view?"*** +***The object detection list is not a comprehensive source of information, when given a visual query like "go to the person wearing a hat" or "Do you see a dog", always Prioritize running observe skill and NavigateWithText*** PLANNING & REASONING: - You can develop both short-term and long-term plans to achieve complex goals From 8fe7a8a5e7ecfc13cf4980ce36d027e4a37d51c6 Mon Sep 17 00:00:00 2001 From: stash Date: Fri, 11 Jul 2025 18:31:53 -0700 Subject: [PATCH 18/18] Delete unused standalone voice web interface --- dimos/web/voice_web_interface.py | 228 ------------------------------- 1 file changed, 228 deletions(-) delete mode 100644 dimos/web/voice_web_interface.py diff --git a/dimos/web/voice_web_interface.py b/dimos/web/voice_web_interface.py deleted file mode 100644 index c5ab8c0d04..0000000000 --- a/dimos/web/voice_web_interface.py +++ /dev/null @@ -1,228 +0,0 @@ -# Copyright 2025 Dimensional Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import annotations - -import io -import time -import logging -from pathlib import Path -from typing import Optional - -import numpy as np -import ffmpeg -import soundfile as sf -import reactivex as rx -from fastapi import FastAPI, UploadFile, File -from fastapi.responses import HTMLResponse, JSONResponse -from fastapi.middleware.cors import CORSMiddleware -from reactivex.subject import Subject -from reactivex import operators as ops - -from dimos.web.edge_io import EdgeIO -from dimos.stream.audio.base import AudioEvent - -logger = logging.getLogger(__name__) -logging.basicConfig(level=logging.INFO) - - -class VoiceWebInterface(EdgeIO): - """A minimal FastAPI server that captures audio from the browser and exposes it - as a ReactiveX ``Observable`` of :class:`dimos.stream.audio.base.AudioEvent`. - - The browser side records audio using the MediaRecorder API and sends it as a - single *webm* blob to the ``/upload_audio`` endpoint when the user stops - recording. The server converts the blob to mono 16-kHz *wav* using - *ffmpeg*, loads it into a NumPy array with *soundfile*, wraps it in an - :class:`AudioEvent`, and finally pushes it to an internal - ``Subject``. Down-stream components such as the Whisper STT node can simply - subscribe to :pyattr:`audio_stream` or call :pyfunc:`emit_audio` to receive - the audio events. - """ - - def __init__(self, host: str = "0.0.0.0", port: int = 5560): - super().__init__(dev_name="Voice Web Interface", edge_type="Input") - - self.host = host - self.port = port - - # Reactive stream for audio events - self._audio_subject: Subject = Subject() - # Shared observable so multiple subscribers receive the same events - self.audio_stream = self._audio_subject.pipe(ops.share()) - - # FastAPI app & CORS for local development - self.app = FastAPI() - self.app.add_middleware( - CORSMiddleware, - allow_origins=["*"], - allow_credentials=True, - allow_methods=["*"], - allow_headers=["*"], - ) - - self._setup_routes() - - # ------------------------------------------------------------------ - # Public helpers - # ------------------------------------------------------------------ - - def emit_audio(self): - """Return the shared audio observable.""" - return self.audio_stream - - # ------------------------------------------------------------------ - # FastAPI routes - # ------------------------------------------------------------------ - - def _setup_routes(self): - @self.app.get("/", response_class=HTMLResponse) - async def index(): # noqa: D401 – simple page - """Return minimal HTML that records and uploads audio.""" - return HTMLResponse(content=self._index_html(), status_code=200) - - @self.app.post("/upload_audio") - async def upload_audio(file: UploadFile = File(...)): - try: - data = await file.read() - audio_np, sr = self._decode_audio(data) - if audio_np is None: - return JSONResponse( - status_code=400, - content={"success": False, "message": "Unable to decode audio"}, - ) - - event = AudioEvent( - data=audio_np, - sample_rate=sr, - timestamp=time.time(), - channels=1 if audio_np.ndim == 1 else audio_np.shape[1], - ) - - # Push to reactive stream - self._audio_subject.on_next(event) - logger.info("Received audio – %.2f s, %d Hz", event.data.shape[0] / sr, sr) - return {"success": True} - except Exception as e: # pragma: no cover – runtime safety - logger.exception("Failed to process uploaded audio: %s", e) - return JSONResponse(status_code=500, content={"success": False, "message": str(e)}) - - # ------------------------------------------------------------------ - # Internal helpers - # ------------------------------------------------------------------ - - @staticmethod - def _decode_audio(raw: bytes) -> tuple[Optional[np.ndarray], Optional[int]]: - """Convert the *webm/opus* blob sent by the browser into mono 16-kHz PCM. - - Returns (audio, sample_rate) or (None, None) on failure. - """ - try: - # Use ffmpeg to convert to 16-kHz mono 16-bit PCM WAV in memory - out, _ = ( - ffmpeg.input("pipe:0") - .output( - "pipe:1", - format="wav", - acodec="pcm_s16le", - ac=1, - ar="16000", - loglevel="quiet", - ) - .run(input=raw, capture_stdout=True, capture_stderr=True) - ) - # Load with soundfile (returns float32 by default) - audio, sr = sf.read(io.BytesIO(out), dtype="float32") - # Ensure 1-D array (mono) - if audio.ndim > 1: - audio = audio[:, 0] - return np.array(audio), sr - except Exception as exc: - logger.error("ffmpeg decoding failed: %s", exc) - return None, None - - @staticmethod - def _index_html() -> str: - """Return HTML/JS for the voice interface.""" - return """ - - - - - - Voice Command Interface - - - -

Voice Command Interface

- -
- - - - -""" - - # ------------------------------------------------------------------ - # Server runner - # ------------------------------------------------------------------ - - def run(self): - """Run the FastAPI application using Uvicorn.""" - import uvicorn - - uvicorn.run(self.app, host=self.host, port=self.port)