diff --git a/dimos/agents2/cli/web.py b/dimos/agents2/cli/web.py new file mode 100644 index 0000000000..cb589ac5ce --- /dev/null +++ b/dimos/agents2/cli/web.py @@ -0,0 +1,87 @@ +# Copyright 2025 Dimensional Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from threading import Thread +from typing import TYPE_CHECKING + +import reactivex as rx +import reactivex.operators as ops + +from dimos.core import Module, rpc +from dimos.core.transport import pLCMTransport +from dimos.stream.audio.node_normalizer import AudioNormalizer +from dimos.stream.audio.stt.node_whisper import WhisperNode +from dimos.utils.logging_config import setup_logger +from dimos.web.robot_web_interface import RobotWebInterface + +if TYPE_CHECKING: + from dimos.stream.audio.base import AudioEvent + +logger = setup_logger(__name__) + + +class WebInput(Module): + _web_interface: RobotWebInterface | None = None + _thread: Thread | None = None + _human_transport: pLCMTransport[str] | None = None + + @rpc + def start(self) -> None: + super().start() + + self._human_transport = pLCMTransport("/human_input") + + audio_subject: rx.subject.Subject[AudioEvent] = rx.subject.Subject() + + self._web_interface = RobotWebInterface( + port=5555, + text_streams={"agent_responses": rx.subject.Subject()}, + audio_subject=audio_subject, + ) + + normalizer = AudioNormalizer() + stt_node = WhisperNode() + + # Connect audio pipeline: browser audio → normalizer → whisper + normalizer.consume_audio(audio_subject.pipe(ops.share())) + stt_node.consume_audio(normalizer.emit_audio()) + + # Subscribe to both text input sources + # 1. Direct text from web interface + unsub = self._web_interface.query_stream.subscribe(self._human_transport.publish) + self._disposables.add(unsub) + + # 2. Transcribed text from STT + unsub = stt_node.emit_text().subscribe(self._human_transport.publish) + self._disposables.add(unsub) + + self._thread = Thread(target=self._web_interface.run, daemon=True) + self._thread.start() + + logger.info("Web interface started at http://localhost:5555") + + @rpc + def stop(self) -> None: + if self._web_interface: + self._web_interface.shutdown() + if self._thread: + self._thread.join(timeout=1.0) + if self._human_transport: + self._human_transport.lcm.stop() + super().stop() + + +web_input = WebInput.blueprint + +__all__ = ["WebInput", "web_input"] diff --git a/dimos/agents2/skills/speak_skill.py b/dimos/agents2/skills/speak_skill.py new file mode 100644 index 0000000000..0623824266 --- /dev/null +++ b/dimos/agents2/skills/speak_skill.py @@ -0,0 +1,104 @@ +# Copyright 2025 Dimensional Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import threading +import time + +from reactivex import Subject + +from dimos.core.core import rpc +from dimos.core.skill_module import SkillModule +from dimos.protocol.skill.skill import skill +from dimos.stream.audio.node_output import SounddeviceAudioOutput +from dimos.stream.audio.tts.node_openai import OpenAITTSNode, Voice +from dimos.utils.logging_config import setup_logger + +logger = setup_logger("dimos.agents2.skills.speak_skill") + + +class SpeakSkill(SkillModule): + _tts_node: OpenAITTSNode | None = None + _audio_output: SounddeviceAudioOutput | None = None + _audio_lock: threading.Lock = threading.Lock() + + @rpc + def start(self) -> None: + super().start() + self._tts_node = OpenAITTSNode(speed=1.2, voice=Voice.ONYX) + self._audio_output = SounddeviceAudioOutput(sample_rate=24000) + self._audio_output.consume_audio(self._tts_node.emit_audio()) + + @rpc + def stop(self) -> None: + if self._tts_node: + self._tts_node.dispose() + self._tts_node = None + if self._audio_output: + self._audio_output.stop() + self._audio_output = None + super().stop() + + @skill() + def speak(self, text: str) -> str: + """Speak text out loud through the robot's speakers. + + USE THIS TOOL AS OFTEN AS NEEDED. People can't normally see what you say in text, but can hear what you speak. + + Try to be as concise as possible. Remember that speaking takes time, so get to the point quickly. + + Example usage: + + speak("Hello, I am your robot assistant.") + """ + if self._tts_node is None: + return "Error: TTS not initialized" + + # Use lock to prevent simultaneous speech + with self._audio_lock: + text_subject: Subject[str] = Subject() + audio_complete = threading.Event() + self._tts_node.consume_text(text_subject) + + def set_as_complete(_t: str) -> None: + audio_complete.set() + + def set_as_complete_e(_e: Exception) -> None: + audio_complete.set() + + subscription = self._tts_node.emit_text().subscribe( + on_next=set_as_complete, + on_error=set_as_complete_e, + ) + + text_subject.on_next(text) + text_subject.on_completed() + + timeout = max(5, len(text) * 0.1) + + if not audio_complete.wait(timeout=timeout): + logger.warning(f"TTS timeout reached for: {text}") + subscription.dispose() + return f"Warning: TTS timeout while speaking: {text}" + else: + # Small delay to ensure buffers flush + time.sleep(0.3) + + subscription.dispose() + + return f"Spoke: {text}" + + +speak_skill = SpeakSkill.blueprint + +__all__ = ["SpeakSkill", "speak_skill"] diff --git a/dimos/robot/all_blueprints.py b/dimos/robot/all_blueprints.py index 997cbb324d..6e2f0c656e 100644 --- a/dimos/robot/all_blueprints.py +++ b/dimos/robot/all_blueprints.py @@ -66,10 +66,12 @@ "osm_skill": "dimos.agents2.skills.osm", "ros_nav": "dimos.navigation.rosnav", "spatial_memory": "dimos.perception.spatial_perception", + "speak_skill": "dimos.agents2.skills.speak_skill", "unitree_skills": "dimos.robot.unitree_webrtc.unitree_skill_container", "utilization": "dimos.utils.monitoring", "wavefront_frontier_explorer": "dimos.navigation.frontier_exploration.wavefront_frontier_goal_selector", "websocket_vis": "dimos.web.websocket_vis.websocket_vis_module", + "web_input": "dimos.agents2.cli.web", } diff --git a/dimos/robot/unitree_webrtc/unitree_g1_blueprints.py b/dimos/robot/unitree_webrtc/unitree_g1_blueprints.py index bcbf8c4c6a..f7041400b4 100644 --- a/dimos/robot/unitree_webrtc/unitree_g1_blueprints.py +++ b/dimos/robot/unitree_webrtc/unitree_g1_blueprints.py @@ -42,7 +42,7 @@ from dimos.msgs.nav_msgs import Odometry, Path from dimos.msgs.sensor_msgs import Image, PointCloud2 from dimos.msgs.std_msgs import Bool -from dimos.msgs.vision_msgs import Detection2DArray, Detection3DArray +from dimos.msgs.vision_msgs import Detection2DArray from dimos.navigation.bt_navigator.navigator import ( behavior_tree_navigator, ) diff --git a/dimos/robot/unitree_webrtc/unitree_go2_blueprints.py b/dimos/robot/unitree_webrtc/unitree_go2_blueprints.py index e1383ae0f5..e6bd615870 100644 --- a/dimos/robot/unitree_webrtc/unitree_go2_blueprints.py +++ b/dimos/robot/unitree_webrtc/unitree_go2_blueprints.py @@ -18,8 +18,10 @@ from dimos.agents2.agent import llm_agent from dimos.agents2.cli.human import human_input +from dimos.agents2.cli.web import web_input from dimos.agents2.ollama_agent import ollama_installed from dimos.agents2.skills.navigation import navigation_skill +from dimos.agents2.skills.speak_skill import speak_skill from dimos.agents2.spec import Provider from dimos.constants import DEFAULT_CAPACITY_COLOR_IMAGE from dimos.core.blueprints import autoconnect @@ -113,6 +115,8 @@ human_input(), navigation_skill(), unitree_skills(), + web_input(), + speak_skill(), ) agentic = autoconnect( diff --git a/dimos/robot/unitree_webrtc/unitree_skill_container.py b/dimos/robot/unitree_webrtc/unitree_skill_container.py index 5bb196d028..e346849c1f 100644 --- a/dimos/robot/unitree_webrtc/unitree_skill_container.py +++ b/dimos/robot/unitree_webrtc/unitree_skill_container.py @@ -105,11 +105,6 @@ def current_time(self): # type: ignore[no-untyped-def] yield str(datetime.datetime.now()) time.sleep(1) - @skill() - def speak(self, text: str) -> str: - """Speak text out loud through the robot's speakers.""" - return f"This is being said aloud: {text}" - @skill() def execute_sport_command(self, command_name: str) -> str: if self._publish_request is None: diff --git a/dimos/web/dimos_interface/api/server.py b/dimos/web/dimos_interface/api/server.py index 5db477dbca..faa787851e 100644 --- a/dimos/web/dimos_interface/api/server.py +++ b/dimos/web/dimos_interface/api/server.py @@ -68,6 +68,7 @@ def __init__( # type: ignore[no-untyped-def] print("Starting FastAPIServer initialization...") # Debug print super().__init__(dev_name, edge_type) self.app = FastAPI() + self._server: uvicorn.Server | None = None # Add CORS middleware with more permissive settings for development self.app.add_middleware( @@ -353,10 +354,18 @@ async def text_stream(key: str): # type: ignore[no-untyped-def] self.app.get(f"/video_feed/{key}")(self.create_video_feed_route(key)) # type: ignore[no-untyped-call] def run(self) -> None: - """Run the FastAPI server.""" - uvicorn.run( - self.app, host=self.host, port=self.port - ) # TODO: Translate structure to enable in-built workers' + config = uvicorn.Config( + self.app, + host=self.host, + port=self.port, + log_level="error", # Reduce verbosity + ) + self._server = uvicorn.Server(config) + self._server.run() + + def shutdown(self) -> None: + if self._server is not None: + self._server.should_exit = True if __name__ == "__main__":