diff --git a/assets/agent/prompt.txt b/assets/agent/prompt.txt index 8a2ccff020..b3d159a7df 100644 --- a/assets/agent/prompt.txt +++ b/assets/agent/prompt.txt @@ -64,7 +64,13 @@ Saved Robot Locations: ***ALWAYS CHECK FIRST if you can find a navigation query in the Saved Robot Locations before running the NavigateWithText tool call. If a saved location is found, get there with NavigateToGoal.*** -***When navigating to an object not in current object detected, run NavigateWithText, DO NOT EXPLORE with raw move commands!!! +***Don't use object detections for navigating to an object, ALWAYS run NavigateWithText. Only use object detections if NavigateWithText fails*** + +***When running NavigateWithText, set skip_visual_search flag to TRUE if the query is a general location such as kitchen or office, if it fails, then run without this flag*** + +***When navigating to an object not in current object detected, run NavigateWithText, DO NOT EXPLORE with raw move commands!!!*** + +***The object detection list is not a comprehensive source of information, when given a visual query like "go to the person wearing a hat" or "Do you see a dog", always Prioritize running observe skill and NavigateWithText*** PLANNING & REASONING: - You can develop both short-term and long-term plans to achieve complex goals diff --git a/dimos/agents/claude_agent.py b/dimos/agents/claude_agent.py index 53148d17b3..e87b1f47b4 100644 --- a/dimos/agents/claude_agent.py +++ b/dimos/agents/claude_agent.py @@ -624,8 +624,11 @@ def _observable_query( observer.on_completed() except Exception as e: logger.error(f"Query failed in {self.dev_name}: {e}") - observer.on_error(e) - self.response_subject.on_error(e) + # Send a user-friendly error message instead of propagating the error + error_message = "I apologize, but I'm having trouble processing your request right now. Please try again." + observer.on_next(error_message) + self.response_subject.on_next(error_message) + observer.on_completed() def _handle_tooling(self, response_message, messages): """Executes tools and appends tool-use/result blocks to messages.""" @@ -649,12 +652,36 @@ def _handle_tooling(self, response_message, messages): } messages.append({"role": "assistant", "content": [tool_use_block]}) - # Execute the tool - args = json.loads(tool_call.function.arguments) - tool_result = self.skills.call(tool_call.function.name, **args) - - # Add tool result to conversation history - if tool_result: + try: + # Execute the tool + args = json.loads(tool_call.function.arguments) + tool_result = self.skills.call(tool_call.function.name, **args) + + # Check if the result is an error message + if isinstance(tool_result, str) and ( + "Error executing skill" in tool_result or "is not available" in tool_result + ): + # Log the error but provide a user-friendly message + logger.error(f"Tool execution failed: {tool_result}") + tool_result = "I apologize, but I'm having trouble executing that action right now. Please try again or ask for something else." + + # Add tool result to conversation history + if tool_result: + messages.append( + { + "role": "user", + "content": [ + { + "type": "tool_result", + "tool_use_id": tool_call.id, + "content": f"{tool_result}", + } + ], + } + ) + except Exception as e: + logger.error(f"Unexpected error executing tool {tool_call.function.name}: {e}") + # Add error result to conversation history messages.append( { "role": "user", @@ -662,7 +689,7 @@ def _handle_tooling(self, response_message, messages): { "type": "tool_result", "tool_use_id": tool_call.id, - "content": f"{tool_result}", + "content": "I apologize, but I encountered an error while trying to execute that action. Please try again.", } ], } @@ -672,13 +699,19 @@ def _tooling_callback(self, response_message): """Runs the observable query for each tool call in the current response_message""" if not hasattr(response_message, "tool_calls") or not response_message.tool_calls: return - for tool_call in response_message.tool_calls: - tool_name = tool_call.function.name - tool_id = tool_call.id - self.run_observable_query( - query_text=f"Tool {tool_name}, ID: {tool_id} execution complete. Please summarize the results and continue.", - thinking_budget_tokens=0, - ).run() + + try: + for tool_call in response_message.tool_calls: + tool_name = tool_call.function.name + tool_id = tool_call.id + self.run_observable_query( + query_text=f"Tool {tool_name}, ID: {tool_id} execution complete. Please summarize the results and continue.", + thinking_budget_tokens=0, + ).run() + except Exception as e: + logger.error(f"Error in tooling callback: {e}") + # Continue processing even if the callback fails + pass def _debug_api_call(self, claude_params: dict): """Debugging function to log API calls with truncated base64 data.""" diff --git a/dimos/models/qwen/video_query.py b/dimos/models/qwen/video_query.py index 3471e52268..7eda5f1aed 100644 --- a/dimos/models/qwen/video_query.py +++ b/dimos/models/qwen/video_query.py @@ -128,7 +128,7 @@ def query_single_frame( openai_client=qwen_client, model_name=model_name, tokenizer=HuggingFaceTokenizer(model_name=f"Qwen/{model_name}"), - max_output_tokens_per_request=100, + max_output_tokens_per_request=8192, system_query=query, pool_scheduler=get_scheduler(), ) diff --git a/dimos/perception/object_detection_stream.py b/dimos/perception/object_detection_stream.py index 552a31acfe..3284d99f8b 100644 --- a/dimos/perception/object_detection_stream.py +++ b/dimos/perception/object_detection_stream.py @@ -205,7 +205,7 @@ def process_frame(frame): # position and rotation are already Vector objects, no need to convert robot_pose = self.get_pose() position, rotation = transform_robot_to_map( - robot_pose, position, rotation + robot_pose["position"], robot_pose["rotation"], position, rotation ) except Exception as e: logger.error(f"Error transforming to map frame: {e}") diff --git a/dimos/robot/frontier_exploration/wavefront_frontier_goal_selector.py b/dimos/robot/frontier_exploration/wavefront_frontier_goal_selector.py index 5f9032aa28..76f2ddbb0a 100644 --- a/dimos/robot/frontier_exploration/wavefront_frontier_goal_selector.py +++ b/dimos/robot/frontier_exploration/wavefront_frontier_goal_selector.py @@ -83,7 +83,7 @@ def __init__( self, min_frontier_size: int = 10, occupancy_threshold: int = 65, - subsample_resolution: int = 3, + subsample_resolution: int = 2, min_distance_from_robot: float = 0.5, explored_area_buffer: float = 0.5, min_distance_from_obstacles: float = 0.6, diff --git a/dimos/robot/unitree/unitree_skills.py b/dimos/robot/unitree/unitree_skills.py index d191753294..5029123ed1 100644 --- a/dimos/robot/unitree/unitree_skills.py +++ b/dimos/robot/unitree/unitree_skills.py @@ -51,34 +51,34 @@ 1006, "Recovers the robot to a state from which it can take more commands. Useful to run after multiple dynamic commands like front flips.", ), - ( - "Euler", - 1007, - "Adjusts the robot's orientation using Euler angles, providing precise control over its rotation.", - ), + # ( + # "Euler", + # 1007, + # "Adjusts the robot's orientation using Euler angles, providing precise control over its rotation.", + # ), # ("Move", 1008, "Move the robot using velocity commands."), # Intentionally omitted ("Sit", 1009, "Commands the robot to sit down from a standing or moving stance."), - ( - "RiseSit", - 1010, - "Commands the robot to rise back to a standing position from a sitting posture.", - ), - ( - "SwitchGait", - 1011, - "Switches the robot's walking pattern or style dynamically, suitable for different terrains or speeds.", - ), - ("Trigger", 1012, "Triggers a specific action or custom routine programmed into the robot."), - ( - "BodyHeight", - 1013, - "Adjusts the height of the robot's body from the ground, useful for navigating various obstacles.", - ), - ( - "FootRaiseHeight", - 1014, - "Controls how high the robot lifts its feet during movement, which can be adjusted for different surfaces.", - ), + # ( + # "RiseSit", + # 1010, + # "Commands the robot to rise back to a standing position from a sitting posture.", + # ), + # ( + # "SwitchGait", + # 1011, + # "Switches the robot's walking pattern or style dynamically, suitable for different terrains or speeds.", + # ), + # ("Trigger", 1012, "Triggers a specific action or custom routine programmed into the robot."), + # ( + # "BodyHeight", + # 1013, + # "Adjusts the height of the robot's body from the ground, useful for navigating various obstacles.", + # ), + # ( + # "FootRaiseHeight", + # 1014, + # "Controls how high the robot lifts its feet during movement, which can be adjusted for different surfaces.", + # ), ( "SpeedLevel", 1015, @@ -90,16 +90,16 @@ "Performs a greeting action, which could involve a wave or other friendly gesture.", ), ("Stretch", 1017, "Engages the robot in a stretching routine."), - ( - "TrajectoryFollow", - 1018, - "Directs the robot to follow a predefined trajectory, which could involve complex paths or maneuvers.", - ), - ( - "ContinuousGait", - 1019, - "Enables a mode for continuous walking or running, ideal for long-distance travel.", - ), + # ( + # "TrajectoryFollow", + # 1018, + # "Directs the robot to follow a predefined trajectory, which could involve complex paths or maneuvers.", + # ), + # ( + # "ContinuousGait", + # 1019, + # "Enables a mode for continuous walking or running, ideal for long-distance travel.", + # ), ("Content", 1020, "To display or trigger when the robot is happy."), ("Wallow", 1021, "The robot falls onto its back and rolls around."), ( @@ -108,18 +108,18 @@ "Performs a predefined dance routine 1, programmed for entertainment or demonstration.", ), ("Dance2", 1023, "Performs another variant of a predefined dance routine 2."), - ("GetBodyHeight", 1024, "Retrieves the current height of the robot's body from the ground."), - ( - "GetFootRaiseHeight", - 1025, - "Retrieves the current height at which the robot's feet are being raised during movement.", - ), - ("GetSpeedLevel", 1026, "Returns the current speed level at which the robot is operating."), - ( - "SwitchJoystick", - 1027, - "Toggles the control mode to joystick input, allowing for manual direction of the robot's movements.", - ), + # ("GetBodyHeight", 1024, "Retrieves the current height of the robot's body from the ground."), + # ( + # "GetFootRaiseHeight", + # 1025, + # "Retrieves the current height at which the robot's feet are being raised during movement.", + # ), + # ("GetSpeedLevel", 1026, "Returns the current speed level at which the robot is operating."), + # ( + # "SwitchJoystick", + # 1027, + # "Toggles the control mode to joystick input, allowing for manual direction of the robot's movements.", + # ), ( "Pose", 1028, @@ -137,46 +137,46 @@ 1032, "Initiates a pouncing movement forward, mimicking animal-like pouncing behavior.", ), - ("WiggleHips", 1033, "Causes the robot to wiggle its hips."), - ( - "GetState", - 1034, - "Retrieves the current operational state of the robot, including status reports or diagnostic information.", - ), - ( - "EconomicGait", - 1035, - "Engages a more energy-efficient walking or running mode to conserve battery life.", - ), - ("FingerHeart", 1036, "Performs a finger heart gesture while on its hind legs."), - ( - "Handstand", - 1301, - "Commands the robot to perform a handstand, demonstrating balance and control.", - ), - ( - "CrossStep", - 1302, - "Engages the robot in a cross-stepping routine, useful for complex locomotion or dance moves.", - ), - ( - "OnesidedStep", - 1303, - "Commands the robot to perform a stepping motion that predominantly uses one side.", - ), - ( - "Bound", - 1304, - "Initiates a bounding motion, similar to a light, repetitive hopping or leaping.", - ), - ( - "LeadFollow", - 1045, - "Engages follow-the-leader behavior, where the robot follows a designated leader or follows a signal.", - ), - ("LeftFlip", 1042, "Executes a flip towards the left side."), - ("RightFlip", 1043, "Performs a flip towards the right side."), - ("Backflip", 1044, "Executes a backflip, a complex and dynamic maneuver."), + # ("WiggleHips", 1033, "Causes the robot to wiggle its hips."), + # ( + # "GetState", + # 1034, + # "Retrieves the current operational state of the robot, including status reports or diagnostic information.", + # ), + # ( + # "EconomicGait", + # 1035, + # "Engages a more energy-efficient walking or running mode to conserve battery life.", + # ), + # ("FingerHeart", 1036, "Performs a finger heart gesture while on its hind legs."), + # ( + # "Handstand", + # 1301, + # "Commands the robot to perform a handstand, demonstrating balance and control.", + # ), + # ( + # "CrossStep", + # 1302, + # "Engages the robot in a cross-stepping routine, useful for complex locomotion or dance moves.", + # ), + # ( + # "OnesidedStep", + # 1303, + # "Commands the robot to perform a stepping motion that predominantly uses one side.", + # ), + # ( + # "Bound", + # 1304, + # "Initiates a bounding motion, similar to a light, repetitive hopping or leaping.", + # ), + # ( + # "LeadFollow", + # 1045, + # "Engages follow-the-leader behavior, where the robot follows a designated leader or follows a signal.", + # ), + # ("LeftFlip", 1042, "Executes a flip towards the left side."), + # ("RightFlip", 1043, "Performs a flip towards the right side."), + # ("Backflip", 1044, "Executes a backflip, a complex and dynamic maneuver."), ] # region MyUnitreeSkills diff --git a/dimos/robot/unitree_webrtc/unitree_skills.py b/dimos/robot/unitree_webrtc/unitree_skills.py index e6bc4ec759..f9dfc1eede 100644 --- a/dimos/robot/unitree_webrtc/unitree_skills.py +++ b/dimos/robot/unitree_webrtc/unitree_skills.py @@ -50,14 +50,8 @@ ( "RecoveryStand", 1006, - "Recovers the robot to a state from which it can take more commands. Useful to run after multiple dynamic commands like front flips.", + "Recovers the robot to a state from which it can take more commands. Useful to run after multiple dynamic commands like front flips, Must run after skills like sit and jump and standup.", ), - ( - "Euler", - 1007, - "Adjusts the robot's orientation using Euler angles, providing precise control over its rotation.", - ), - # ("Move", 1008, "Move the robot using velocity commands."), # Handled separately ("Sit", 1009, "Commands the robot to sit down from a standing or moving stance."), ( "RiseSit", diff --git a/dimos/skills/navigation.py b/dimos/skills/navigation.py index 114e3dacb8..6d67ae04f2 100644 --- a/dimos/skills/navigation.py +++ b/dimos/skills/navigation.py @@ -60,18 +60,24 @@ class NavigateWithText(AbstractRobotSkill): This skill first attempts to locate an object in the robot's camera view using vision. If the object is found, it navigates to it. If not, it falls back to querying the - semantic map for a location matching the description. For example, "Find the kitchen" - will first look for a kitchen in view, then check the semantic map coordinates where - a kitchen was previously observed. + semantic map for a location matching the description. For example, "Find the Teddy Bear" + will first look for a Teddy Bear in view, then check the semantic map coordinates where + a Teddy Bear was previously observed. CALL THIS SKILL FOR ONE SUBJECT AT A TIME. For example: "Go to the person wearing a blue shirt in the living room", you should call this skill twice, once for the person wearing a blue shirt and once for the living room. + + If skip_visual_search is True, this skill will skip the visual search for the object in view. + This is useful if you want to navigate to a general location such as a kitchen or office. + For example, "Go to the kitchen" will not look for a kitchen in view, but will check the semantic map coordinates where + a kitchen was previously observed. """ query: str = Field("", description="Text query to search for in the semantic map") limit: int = Field(1, description="Maximum number of results to return") distance: float = Field(1.0, description="Desired distance to maintain from object in meters") + skip_visual_search: bool = Field(False, description="Skip visual search for object in view") timeout: float = Field(40.0, description="Maximum time to spend navigating in seconds") def __init__(self, robot=None, **data): @@ -362,22 +368,24 @@ def __call__(self): # First, try to find and navigate to the object in camera view logger.info(f"First attempting to find and navigate to visible object: '{self.query}'") - object_result = self._navigate_to_object() - if object_result and object_result["success"]: - logger.info(f"Successfully navigated to {self.query} in view") - return object_result + if not self.skip_visual_search: + object_result = self._navigate_to_object() - elif object_result and object_result["failure_reason"] == "Navigation": + if object_result and object_result["success"]: + logger.info(f"Successfully navigated to {self.query} in view") + return object_result + + elif object_result and object_result["failure_reason"] == "Navigation": + logger.info( + f"Failed to navigate to {self.query} in view: {object_result.get('error', 'Unknown error')}" + ) + return object_result + + # If object navigation failed, fall back to semantic map logger.info( - f"Failed to navigate to {self.query} in view: {object_result.get('error', 'Unknown error')}" + f"Object not found in view. Falling back to semantic map query for: '{self.query}'" ) - return object_result - - # If object navigation failed, fall back to semantic map - logger.info( - f"Object not found in view. Falling back to semantic map query for: '{self.query}'" - ) return self._navigate_using_semantic_map() @@ -405,12 +413,6 @@ def stop(self): logger.error(f"Error disposing navigation task: {e}") self._navigation_disposable = None - # Clean up spatial memory if it exists - if hasattr(self, "_spatial_memory") and self._spatial_memory is not None: - logger.info("Cleaning up spatial memory") - self._spatial_memory.cleanup() - self._spatial_memory = None - return "Navigate skill stopped successfully." diff --git a/dimos/skills/observe.py b/dimos/skills/observe.py index 844df11805..067307353a 100644 --- a/dimos/skills/observe.py +++ b/dimos/skills/observe.py @@ -106,14 +106,8 @@ def __call__(self): # Process the frame with Qwen response = self._process_frame_with_qwen(frame) - # Add the response to the conversation history - # self._agent.append_to_history( - # f"Observation: {response}", - # ) - response = self._agent.run_observable_query(f"Observation: {response}") - logger.info(f"Added Qwen observation to conversation history") - return f"Observation complete: {response[:100]}..." + return f"Observation complete: {response}" except Exception as e: error_msg = f"Error in Observe skill: {e}" @@ -127,6 +121,10 @@ def _get_frame_from_stream(self): Returns: A single frame from the video stream, or None if no frame is available """ + if self._video_stream is None: + logger.error("Video stream is None") + return None + frame = None frame_subject = rx.subject.Subject() diff --git a/dimos/skills/skills.py b/dimos/skills/skills.py index 559f9b0e36..f6c7456d24 100644 --- a/dimos/skills/skills.py +++ b/dimos/skills/skills.py @@ -121,28 +121,35 @@ def create_instance(self, name, **kwargs): print(f"Stored args for later instance creation: {name} with args: {kwargs}") def call(self, name, **args): - # Get the stored args if available; otherwise, use an empty dict - stored_args = self._instances.get(name, {}) - - # Merge the arguments with priority given to stored arguments - complete_args = {**args, **stored_args} - - # Dynamically get the class from the module or current script - skill_class = getattr(self, name, None) - if skill_class is None: - for skill in self.get(): - if name == skill.__name__: - skill_class = skill - break - if skill_class is None: - raise ValueError(f"Skill class not found: {name}") + try: + # Get the stored args if available; otherwise, use an empty dict + stored_args = self._instances.get(name, {}) - # Initialize the instance with the merged arguments - instance = skill_class(**complete_args) - print(f"Instance created and function called for: {name} with args: {complete_args}") + # Merge the arguments with priority given to stored arguments + complete_args = {**args, **stored_args} - # Call the instance directly - return instance() + # Dynamically get the class from the module or current script + skill_class = getattr(self, name, None) + if skill_class is None: + for skill in self.get(): + if name == skill.__name__: + skill_class = skill + break + if skill_class is None: + error_msg = f"Skill '{name}' is not available. Please check if it's properly registered." + logger.error(f"Skill class not found: {name}") + return error_msg + + # Initialize the instance with the merged arguments + instance = skill_class(**complete_args) + print(f"Instance created and function called for: {name} with args: {complete_args}") + + # Call the instance directly + return instance() + except Exception as e: + error_msg = f"Error executing skill '{name}': {str(e)}" + logger.error(error_msg) + return error_msg # ==== Tools ==== diff --git a/dimos/skills/unitree/__init__.py b/dimos/skills/unitree/__init__.py new file mode 100644 index 0000000000..8b13789179 --- /dev/null +++ b/dimos/skills/unitree/__init__.py @@ -0,0 +1 @@ + diff --git a/dimos/skills/unitree/unitree_speak.py b/dimos/skills/unitree/unitree_speak.py new file mode 100644 index 0000000000..05004398f9 --- /dev/null +++ b/dimos/skills/unitree/unitree_speak.py @@ -0,0 +1,280 @@ +# Copyright 2025 Dimensional Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from dimos.skills.skills import AbstractRobotSkill +from pydantic import Field +import time +import tempfile +import os +import json +import base64 +import hashlib +import soundfile as sf +import numpy as np +from openai import OpenAI +from dimos.utils.logging_config import setup_logger +from go2_webrtc_driver.constants import RTC_TOPIC + +logger = setup_logger("dimos.skills.unitree.unitree_speak") + +# Audio API constants (from go2_webrtc_driver) +AUDIO_API = { + "GET_AUDIO_LIST": 1001, + "SELECT_START_PLAY": 1002, + "PAUSE": 1003, + "UNSUSPEND": 1004, + "SET_PLAY_MODE": 1007, + "UPLOAD_AUDIO_FILE": 2001, + "ENTER_MEGAPHONE": 4001, + "EXIT_MEGAPHONE": 4002, + "UPLOAD_MEGAPHONE": 4003, +} + +PLAY_MODES = {"NO_CYCLE": "no_cycle", "SINGLE_CYCLE": "single_cycle", "LIST_LOOP": "list_loop"} + + +class UnitreeSpeak(AbstractRobotSkill): + """Speak text out loud through the robot's speakers using WebRTC audio upload.""" + + text: str = Field(..., description="Text to speak") + voice: str = Field( + default="echo", description="Voice to use (alloy, echo, fable, onyx, nova, shimmer)" + ) + speed: float = Field(default=1.2, description="Speech speed (0.25 to 4.0)") + use_megaphone: bool = Field( + default=False, description="Use megaphone mode for lower latency (experimental)" + ) + + def __init__(self, **data): + super().__init__(**data) + self._openai_client = None + + def _get_openai_client(self): + if self._openai_client is None: + self._openai_client = OpenAI() + return self._openai_client + + def _generate_audio(self, text: str) -> bytes: + try: + client = self._get_openai_client() + response = client.audio.speech.create( + model="tts-1", voice=self.voice, input=text, speed=self.speed, response_format="mp3" + ) + return response.content + except Exception as e: + logger.error(f"Error generating audio: {e}") + raise + + def _webrtc_request(self, api_id: int, parameter: dict = None): + if parameter is None: + parameter = {} + + request_data = {"api_id": api_id, "parameter": json.dumps(parameter) if parameter else "{}"} + + return self._robot.webrtc_connection.publish_request( + RTC_TOPIC["AUDIO_HUB_REQ"], request_data + ) + + def _upload_audio_to_robot(self, audio_data: bytes, filename: str) -> str: + try: + file_md5 = hashlib.md5(audio_data).hexdigest() + b64_data = base64.b64encode(audio_data).decode("utf-8") + + chunk_size = 61440 + chunks = [b64_data[i : i + chunk_size] for i in range(0, len(b64_data), chunk_size)] + total_chunks = len(chunks) + + logger.info(f"Uploading audio '{filename}' in {total_chunks} chunks (optimized)") + + for i, chunk in enumerate(chunks, 1): + parameter = { + "file_name": filename, + "file_type": "wav", + "file_size": len(audio_data), + "current_block_index": i, + "total_block_number": total_chunks, + "block_content": chunk, + "current_block_size": len(chunk), + "file_md5": file_md5, + "create_time": int(time.time() * 1000), + } + + logger.debug(f"Sending chunk {i}/{total_chunks}") + response = self._webrtc_request(AUDIO_API["UPLOAD_AUDIO_FILE"], parameter) + + logger.info(f"Audio upload completed for '{filename}'") + + list_response = self._webrtc_request(AUDIO_API["GET_AUDIO_LIST"], {}) + + if list_response and "data" in list_response: + data_str = list_response.get("data", {}).get("data", "{}") + audio_list = json.loads(data_str).get("audio_list", []) + + for audio in audio_list: + if audio.get("CUSTOM_NAME") == filename: + return audio.get("UNIQUE_ID") + + logger.warning( + f"Could not find uploaded audio '{filename}' in list, using filename as UUID" + ) + return filename + + except Exception as e: + logger.error(f"Error uploading audio to robot: {e}") + raise + + def _play_audio_on_robot(self, uuid: str): + try: + self._webrtc_request(AUDIO_API["SET_PLAY_MODE"], {"play_mode": PLAY_MODES["NO_CYCLE"]}) + time.sleep(0.1) + + parameter = {"unique_id": uuid} + + logger.info(f"Playing audio with UUID: {uuid}") + self._webrtc_request(AUDIO_API["SELECT_START_PLAY"], parameter) + + except Exception as e: + logger.error(f"Error playing audio on robot: {e}") + raise + + def _stop_audio_playback(self): + try: + logger.debug("Stopping audio playback") + self._webrtc_request(AUDIO_API["PAUSE"], {}) + except Exception as e: + logger.warning(f"Error stopping audio playback: {e}") + + def _upload_and_play_megaphone(self, audio_data: bytes, duration: float): + try: + logger.debug("Entering megaphone mode") + self._webrtc_request(AUDIO_API["ENTER_MEGAPHONE"], {}) + + time.sleep(0.2) + + b64_data = base64.b64encode(audio_data).decode("utf-8") + + chunk_size = 4096 + chunks = [b64_data[i : i + chunk_size] for i in range(0, len(b64_data), chunk_size)] + total_chunks = len(chunks) + + logger.info(f"Uploading megaphone audio in {total_chunks} chunks") + + for i, chunk in enumerate(chunks, 1): + parameter = { + "current_block_size": len(chunk), + "block_content": chunk, + "current_block_index": i, + "total_block_number": total_chunks, + } + + logger.debug(f"Sending megaphone chunk {i}/{total_chunks}") + self._webrtc_request(AUDIO_API["UPLOAD_MEGAPHONE"], parameter) + + if i < total_chunks: + time.sleep(0.05) + + logger.info("Megaphone audio upload completed, waiting for playback") + + time.sleep(duration + 1.0) + + except Exception as e: + logger.error(f"Error in megaphone mode: {e}") + try: + self._webrtc_request(AUDIO_API["EXIT_MEGAPHONE"], {}) + except: + pass + raise + finally: + try: + logger.debug("Exiting megaphone mode") + self._webrtc_request(AUDIO_API["EXIT_MEGAPHONE"], {}) + time.sleep(0.1) + except Exception as e: + logger.warning(f"Error exiting megaphone mode: {e}") + + def __call__(self): + super().__call__() + + if not self._robot: + logger.error("No robot instance provided to UnitreeSpeak skill") + return "Error: No robot instance available" + + try: + display_text = self.text[:50] + "..." if len(self.text) > 50 else self.text + logger.info(f"Speaking: '{display_text}'") + + logger.debug("Generating audio with OpenAI TTS") + audio_data = self._generate_audio(self.text) + + with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as tmp_mp3: + tmp_mp3.write(audio_data) + tmp_mp3_path = tmp_mp3.name + + try: + audio_array, sample_rate = sf.read(tmp_mp3_path) + + if audio_array.ndim > 1: + audio_array = np.mean(audio_array, axis=1) + + target_sample_rate = 22050 + if sample_rate != target_sample_rate: + logger.debug(f"Resampling from {sample_rate}Hz to {target_sample_rate}Hz") + old_length = len(audio_array) + new_length = int(old_length * target_sample_rate / sample_rate) + old_indices = np.arange(old_length) + new_indices = np.linspace(0, old_length - 1, new_length) + audio_array = np.interp(new_indices, old_indices, audio_array) + sample_rate = target_sample_rate + + audio_array = audio_array / np.max(np.abs(audio_array)) + + with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_wav: + sf.write(tmp_wav.name, audio_array, sample_rate, format="WAV", subtype="PCM_16") + tmp_wav.seek(0) + wav_data = open(tmp_wav.name, "rb").read() + os.unlink(tmp_wav.name) + + logger.info( + f"Audio size: {len(wav_data) / 1024:.1f}KB, duration: {len(audio_array) / sample_rate:.1f}s" + ) + + finally: + os.unlink(tmp_mp3_path) + + if self.use_megaphone: + logger.debug("Using megaphone mode for lower latency") + duration = len(audio_array) / sample_rate + self._upload_and_play_megaphone(wav_data, duration) + + return f"Spoke: '{display_text}' on robot successfully (megaphone mode)" + else: + filename = f"speak_{int(time.time() * 1000)}" + + logger.debug("Uploading audio to robot") + uuid = self._upload_audio_to_robot(wav_data, filename) + + logger.debug("Playing audio on robot") + self._play_audio_on_robot(uuid) + + duration = len(audio_array) / sample_rate + logger.debug(f"Waiting {duration:.1f}s for playback to complete") + # time.sleep(duration + 0.2) + + # self._stop_audio_playback() + + return f"Spoke: '{display_text}' on robot successfully" + + except Exception as e: + logger.error(f"Error in speak skill: {e}") + return f"Error speaking text: {str(e)}" diff --git a/dimos/web/dimos_interface/api/server.py b/dimos/web/dimos_interface/api/server.py index 812973e300..bcc590ab46 100644 --- a/dimos/web/dimos_interface/api/server.py +++ b/dimos/web/dimos_interface/api/server.py @@ -27,7 +27,7 @@ # Fast Api & Uvicorn import cv2 from dimos.web.edge_io import EdgeIO -from fastapi import FastAPI, Request, Form, HTTPException +from fastapi import FastAPI, Request, Form, HTTPException, UploadFile, File from fastapi.responses import HTMLResponse, StreamingResponse, JSONResponse from sse_starlette.sse import EventSourceResponse from fastapi.templating import Jinja2Templates @@ -42,6 +42,14 @@ import reactivex as rx from fastapi.middleware.cors import CORSMiddleware +# For audio processing +import io +import time +import numpy as np +import ffmpeg +import soundfile as sf +from dimos.stream.audio.base import AudioEvent + # TODO: Resolve threading, start/stop stream functionality. @@ -53,6 +61,7 @@ def __init__( host="0.0.0.0", port=5555, text_streams=None, + audio_subject=None, **streams, ): print("Starting FastAPIServer initialization...") # Debug print @@ -88,6 +97,7 @@ def __init__( # Create a Subject for text queries self.query_subject = rx.subject.Subject() self.query_stream = self.query_subject.pipe(ops.share()) + self.audio_subject = audio_subject for key in self.streams: if self.streams[key] is not None: @@ -198,6 +208,33 @@ async def text_stream_generator(self, key): finally: self.text_clients.remove(client_id) + @staticmethod + def _decode_audio(raw: bytes) -> tuple[np.ndarray, int]: + """Convert the webm/opus blob sent by the browser into mono 16-kHz PCM.""" + try: + # Use ffmpeg to convert to 16-kHz mono 16-bit PCM WAV in memory + out, _ = ( + ffmpeg.input("pipe:0") + .output( + "pipe:1", + format="wav", + acodec="pcm_s16le", + ac=1, + ar="16000", + loglevel="quiet", + ) + .run(input=raw, capture_stdout=True, capture_stderr=True) + ) + # Load with soundfile (returns float32 by default) + audio, sr = sf.read(io.BytesIO(out), dtype="float32") + # Ensure 1-D array (mono) + if audio.ndim > 1: + audio = audio[:, 0] + return np.array(audio), sr + except Exception as exc: + print(f"ffmpeg decoding failed: {exc}") + return None, None + def setup_routes(self): """Set up FastAPI routes.""" @@ -221,6 +258,7 @@ async def index(request: Request): "request": request, "stream_keys": stream_keys, "text_stream_keys": text_stream_keys, + "has_voice": self.audio_subject is not None, }, ) @@ -240,6 +278,39 @@ async def submit_query(query: str = Form(...)): content={"success": False, "message": f"Server error: {str(e)}"}, ) + @self.app.post("/upload_audio") + async def upload_audio(file: UploadFile = File(...)): + """Handle audio upload from the browser.""" + if self.audio_subject is None: + return JSONResponse( + status_code=400, + content={"success": False, "message": "Voice input not configured"}, + ) + + try: + data = await file.read() + audio_np, sr = self._decode_audio(data) + if audio_np is None: + return JSONResponse( + status_code=400, + content={"success": False, "message": "Unable to decode audio"}, + ) + + event = AudioEvent( + data=audio_np, + sample_rate=sr, + timestamp=time.time(), + channels=1 if audio_np.ndim == 1 else audio_np.shape[1], + ) + + # Push to reactive stream + self.audio_subject.on_next(event) + print(f"Received audio – {event.data.shape[0] / sr:.2f} s, {sr} Hz") + return {"success": True} + except Exception as e: + print(f"Failed to process uploaded audio: {e}") + return JSONResponse(status_code=500, content={"success": False, "message": str(e)}) + # Unitree API endpoints @self.app.get("/unitree/status") async def unitree_status(): diff --git a/dimos/web/dimos_interface/api/templates/index_fastapi.html b/dimos/web/dimos_interface/api/templates/index_fastapi.html index 94cf0be328..406557c04a 100644 --- a/dimos/web/dimos_interface/api/templates/index_fastapi.html +++ b/dimos/web/dimos_interface/api/templates/index_fastapi.html @@ -130,6 +130,7 @@ .query-form { display: flex; gap: 10px; + align-items: center; } .query-input { @@ -155,6 +156,81 @@ background-color: #218838; } + /* Voice button styles */ + .voice-button { + width: 50px; + height: 50px; + border-radius: 50%; + background-color: #dc3545; + color: white; + border: none; + cursor: pointer; + display: flex; + align-items: center; + justify-content: center; + font-size: 24px; + transition: all 0.3s ease; + box-shadow: 0 2px 5px rgba(0,0,0,0.2); + position: relative; + } + + .voice-button:hover { + transform: scale(1.1); + box-shadow: 0 4px 8px rgba(0,0,0,0.3); + } + + .voice-button.recording { + background-color: #ff0000; + animation: pulse 1.5s infinite; + } + + .voice-button.recording::after { + content: ''; + position: absolute; + top: -10px; + left: -10px; + right: -10px; + bottom: -10px; + border: 3px solid rgba(255, 0, 0, 0.5); + border-radius: 50%; + animation: ripple 1.5s infinite; + } + + @keyframes pulse { + 0% { transform: scale(1); } + 50% { transform: scale(1.05); } + 100% { transform: scale(1); } + } + + @keyframes ripple { + 0% { + transform: scale(1); + opacity: 1; + } + 100% { + transform: scale(1.2); + opacity: 0; + } + } + + .voice-status { + position: absolute; + top: -25px; + left: 50%; + transform: translateX(-50%); + background-color: rgba(0,0,0,0.8); + color: white; + padding: 4px 8px; + border-radius: 4px; + font-size: 12px; + white-space: nowrap; + display: none; + } + + .voice-button.recording .voice-status { + display: block; + } + .query-response { margin-top: 15px; padding: 10px; @@ -218,6 +294,12 @@