Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
87 changes: 87 additions & 0 deletions dimos/agents2/cli/web.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
# Copyright 2025 Dimensional Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from threading import Thread
from typing import TYPE_CHECKING

import reactivex as rx
import reactivex.operators as ops

from dimos.core import Module, rpc
from dimos.core.transport import pLCMTransport
from dimos.stream.audio.node_normalizer import AudioNormalizer
from dimos.stream.audio.stt.node_whisper import WhisperNode
from dimos.utils.logging_config import setup_logger
from dimos.web.robot_web_interface import RobotWebInterface

if TYPE_CHECKING:
from dimos.stream.audio.base import AudioEvent

logger = setup_logger(__name__)


class WebInput(Module):
    """Module exposing a browser-based input channel for the agent.

    Runs a RobotWebInterface in a background thread and forwards both typed
    text and transcribed speech onto the ``/human_input`` LCM transport.
    """

    # Web server instance; created in start(), shut down in stop().
    _web_interface: RobotWebInterface | None = None
    # Daemon thread running the web interface's blocking run() loop.
    _thread: Thread | None = None
    # Transport that downstream agents read human input from.
    _human_transport: pLCMTransport[str] | None = None

    @rpc
    def start(self) -> None:
        """Wire up the audio/text pipelines and launch the web server."""
        super().start()

        self._human_transport = pLCMTransport("/human_input")

        # Hot subject fed by the web interface with microphone audio events.
        audio_subject: rx.subject.Subject[AudioEvent] = rx.subject.Subject()

        self._web_interface = RobotWebInterface(
            port=5555,
            # NOTE(review): a fresh, otherwise-unreferenced subject — nothing
            # here publishes agent responses to it; confirm this is intended.
            text_streams={"agent_responses": rx.subject.Subject()},
            audio_subject=audio_subject,
        )

        normalizer = AudioNormalizer()
        stt_node = WhisperNode()

        # Connect audio pipeline: browser audio → normalizer → whisper
        normalizer.consume_audio(audio_subject.pipe(ops.share()))
        stt_node.consume_audio(normalizer.emit_audio())

        # Subscribe to both text input sources
        # 1. Direct text from web interface
        unsub = self._web_interface.query_stream.subscribe(self._human_transport.publish)
        self._disposables.add(unsub)

        # 2. Transcribed text from STT
        unsub = stt_node.emit_text().subscribe(self._human_transport.publish)
        self._disposables.add(unsub)

        # run() blocks, so host it on a daemon thread; daemon=True means it
        # will not keep the process alive on exit.
        self._thread = Thread(target=self._web_interface.run, daemon=True)
        self._thread.start()

        logger.info("Web interface started at http://localhost:5555")

    @rpc
    def stop(self) -> None:
        """Shut down the web server, its thread, and the LCM transport.

        Safe to call even if start() was never invoked (all guards are None).
        """
        if self._web_interface:
            self._web_interface.shutdown()
        if self._thread:
            # Bounded join: don't hang shutdown if the server is slow to exit.
            self._thread.join(timeout=1.0)
        if self._human_transport:
            self._human_transport.lcm.stop()
        super().stop()


# Blueprint factory for composing WebInput into robot configurations
# (registered as "web_input" in the blueprint registry).
web_input = WebInput.blueprint

__all__ = ["WebInput", "web_input"]
104 changes: 104 additions & 0 deletions dimos/agents2/skills/speak_skill.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@
# Copyright 2025 Dimensional Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import threading
import time

from reactivex import Subject

from dimos.core.core import rpc
from dimos.core.skill_module import SkillModule
from dimos.protocol.skill.skill import skill
from dimos.stream.audio.node_output import SounddeviceAudioOutput
from dimos.stream.audio.tts.node_openai import OpenAITTSNode, Voice
from dimos.utils.logging_config import setup_logger

logger = setup_logger("dimos.agents2.skills.speak_skill")


class SpeakSkill(SkillModule):
    """Skill that speaks text aloud via OpenAI TTS and the local audio device."""

    # Created in start(); None while the module is stopped.
    _tts_node: OpenAITTSNode | None = None
    _audio_output: SounddeviceAudioOutput | None = None
    # Serializes speak() calls so utterances do not overlap.
    # NOTE(review): class-level attribute, shared by every instance — fine if
    # this module is a singleton; otherwise move creation into start().
    _audio_lock: threading.Lock = threading.Lock()

    @rpc
    def start(self) -> None:
        """Create the TTS node and sound output and wire them together."""
        super().start()
        self._tts_node = OpenAITTSNode(speed=1.2, voice=Voice.ONYX)
        # Output sample rate matches the TTS node's configured rate.
        self._audio_output = SounddeviceAudioOutput(sample_rate=24000)
        self._audio_output.consume_audio(self._tts_node.emit_audio())

    @rpc
    def stop(self) -> None:
        """Tear down the audio pipeline; safe to call if never started."""
        if self._tts_node:
            self._tts_node.dispose()
            self._tts_node = None
        if self._audio_output:
            self._audio_output.stop()
            self._audio_output = None
        super().stop()

    @skill()
    def speak(self, text: str) -> str:
        """Speak text out loud through the robot's speakers.

        USE THIS TOOL AS OFTEN AS NEEDED. People can't normally see what you say in text, but can hear what you speak.

        Try to be as concise as possible. Remember that speaking takes time, so get to the point quickly.

        Example usage:

        speak("Hello, I am your robot assistant.")
        """
        if self._tts_node is None:
            return "Error: TTS not initialized"

        # Use lock to prevent simultaneous speech
        with self._audio_lock:
            text_subject: Subject[str] = Subject()
            audio_complete = threading.Event()
            self._tts_node.consume_text(text_subject)

            def set_as_complete(_t: str) -> None:
                audio_complete.set()

            def set_as_complete_e(_e: Exception) -> None:
                audio_complete.set()

            # NOTE(review): emit_text() signals that the text was processed,
            # not necessarily that playback has finished — hence the flush
            # sleep below; confirm against OpenAITTSNode's semantics.
            subscription = self._tts_node.emit_text().subscribe(
                on_next=set_as_complete,
                on_error=set_as_complete_e,
            )

            # try/finally guarantees the subscription is disposed on every
            # path (timeout, success, or an unexpected exception) — the
            # original leaked it if wait() or the subject calls raised.
            try:
                text_subject.on_next(text)
                text_subject.on_completed()

                # Scale the wait with utterance length; never below 5 seconds.
                timeout = max(5, len(text) * 0.1)

                if not audio_complete.wait(timeout=timeout):
                    logger.warning(f"TTS timeout reached for: {text}")
                    return f"Warning: TTS timeout while speaking: {text}"

                # Small delay to ensure buffers flush
                time.sleep(0.3)
            finally:
                subscription.dispose()

        return f"Spoke: {text}"


# Blueprint factory for composing SpeakSkill into robot configurations
# (registered as "speak_skill" in the blueprint registry).
speak_skill = SpeakSkill.blueprint

__all__ = ["SpeakSkill", "speak_skill"]
2 changes: 2 additions & 0 deletions dimos/robot/all_blueprints.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,10 +66,12 @@
"osm_skill": "dimos.agents2.skills.osm",
"ros_nav": "dimos.navigation.rosnav",
"spatial_memory": "dimos.perception.spatial_perception",
"speak_skill": "dimos.agents2.skills.speak_skill",
"unitree_skills": "dimos.robot.unitree_webrtc.unitree_skill_container",
"utilization": "dimos.utils.monitoring",
"wavefront_frontier_explorer": "dimos.navigation.frontier_exploration.wavefront_frontier_goal_selector",
"websocket_vis": "dimos.web.websocket_vis.websocket_vis_module",
"web_input": "dimos.agents2.cli.web",
}


Expand Down
2 changes: 1 addition & 1 deletion dimos/robot/unitree_webrtc/unitree_g1_blueprints.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@
from dimos.msgs.nav_msgs import Odometry, Path
from dimos.msgs.sensor_msgs import Image, PointCloud2
from dimos.msgs.std_msgs import Bool
from dimos.msgs.vision_msgs import Detection2DArray, Detection3DArray
from dimos.msgs.vision_msgs import Detection2DArray
from dimos.navigation.bt_navigator.navigator import (
behavior_tree_navigator,
)
Expand Down
4 changes: 4 additions & 0 deletions dimos/robot/unitree_webrtc/unitree_go2_blueprints.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,10 @@

from dimos.agents2.agent import llm_agent
from dimos.agents2.cli.human import human_input
from dimos.agents2.cli.web import web_input
from dimos.agents2.ollama_agent import ollama_installed
from dimos.agents2.skills.navigation import navigation_skill
from dimos.agents2.skills.speak_skill import speak_skill
from dimos.agents2.spec import Provider
from dimos.constants import DEFAULT_CAPACITY_COLOR_IMAGE
from dimos.core.blueprints import autoconnect
Expand Down Expand Up @@ -113,6 +115,8 @@
human_input(),
navigation_skill(),
unitree_skills(),
web_input(),
speak_skill(),
)

agentic = autoconnect(
Expand Down
5 changes: 0 additions & 5 deletions dimos/robot/unitree_webrtc/unitree_skill_container.py
Original file line number Diff line number Diff line change
Expand Up @@ -105,11 +105,6 @@ def current_time(self): # type: ignore[no-untyped-def]
yield str(datetime.datetime.now())
time.sleep(1)

@skill()
def speak(self, text: str) -> str:
"""Speak text out loud through the robot's speakers."""
return f"This is being said aloud: {text}"

@skill()
def execute_sport_command(self, command_name: str) -> str:
if self._publish_request is None:
Expand Down
17 changes: 13 additions & 4 deletions dimos/web/dimos_interface/api/server.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,7 @@ def __init__( # type: ignore[no-untyped-def]
print("Starting FastAPIServer initialization...") # Debug print
super().__init__(dev_name, edge_type)
self.app = FastAPI()
self._server: uvicorn.Server | None = None

# Add CORS middleware with more permissive settings for development
self.app.add_middleware(
Expand Down Expand Up @@ -353,10 +354,18 @@ async def text_stream(key: str): # type: ignore[no-untyped-def]
self.app.get(f"/video_feed/{key}")(self.create_video_feed_route(key)) # type: ignore[no-untyped-call]

def run(self) -> None:
"""Run the FastAPI server."""
uvicorn.run(
self.app, host=self.host, port=self.port
) # TODO: Translate structure to enable in-built workers'
config = uvicorn.Config(
self.app,
host=self.host,
port=self.port,
log_level="error", # Reduce verbosity
)
self._server = uvicorn.Server(config)
self._server.run()

def shutdown(self) -> None:
if self._server is not None:
self._server.should_exit = True


if __name__ == "__main__":
Expand Down
Loading