Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
87 changes: 87 additions & 0 deletions dimos/agents2/cli/web.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
# Copyright 2025 Dimensional Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from threading import Thread
from typing import TYPE_CHECKING

import reactivex as rx
import reactivex.operators as ops

from dimos.core import Module, rpc
from dimos.core.transport import pLCMTransport
from dimos.stream.audio.node_normalizer import AudioNormalizer
from dimos.stream.audio.stt.node_whisper import WhisperNode
from dimos.utils.logging_config import setup_logger
from dimos.web.robot_web_interface import RobotWebInterface

if TYPE_CHECKING:
from dimos.stream.audio.base import AudioEvent

logger = setup_logger(__name__)


class WebInput(Module):
    """Module exposing a browser-based input channel for the agent.

    Runs a RobotWebInterface in a background thread and forwards both typed
    text and transcribed speech onto the ``/human_input`` LCM transport.
    """

    # Web server instance; created in start(), shut down in stop().
    _web_interface: RobotWebInterface | None = None
    # Daemon thread running the web interface's blocking run() loop.
    _thread: Thread | None = None
    # Transport that downstream agents read human input from.
    _human_transport: pLCMTransport[str] | None = None

    @rpc
    def start(self) -> None:
        """Wire up the audio/text pipelines and launch the web server."""
        super().start()

        self._human_transport = pLCMTransport("/human_input")

        # Hot subject fed by the web interface with microphone audio events.
        audio_subject: rx.subject.Subject[AudioEvent] = rx.subject.Subject()

        self._web_interface = RobotWebInterface(
            port=5555,
            # NOTE(review): a fresh, otherwise-unreferenced subject — nothing
            # here publishes agent responses to it; confirm this is intended.
            text_streams={"agent_responses": rx.subject.Subject()},
            audio_subject=audio_subject,
        )

        normalizer = AudioNormalizer()
        stt_node = WhisperNode()

        # Connect audio pipeline: browser audio → normalizer → whisper
        normalizer.consume_audio(audio_subject.pipe(ops.share()))
        stt_node.consume_audio(normalizer.emit_audio())

        # Subscribe to both text input sources
        # 1. Direct text from web interface
        unsub = self._web_interface.query_stream.subscribe(self._human_transport.publish)
        self._disposables.add(unsub)

        # 2. Transcribed text from STT
        unsub = stt_node.emit_text().subscribe(self._human_transport.publish)
        self._disposables.add(unsub)

        # run() blocks, so host it on a daemon thread; daemon=True means it
        # will not keep the process alive on exit.
        self._thread = Thread(target=self._web_interface.run, daemon=True)
        self._thread.start()

        logger.info("Web interface started at http://localhost:5555")

    @rpc
    def stop(self) -> None:
        """Shut down the web server, its thread, and the LCM transport.

        Safe to call even if start() was never invoked (all guards are None).
        """
        if self._web_interface:
            self._web_interface.shutdown()
        if self._thread:
            # Bounded join: don't hang shutdown if the server is slow to exit.
            self._thread.join(timeout=1.0)
        if self._human_transport:
            self._human_transport.lcm.stop()
        super().stop()


# Blueprint factory for composing WebInput into robot configurations
# (registered as "web_input" in the blueprint registry).
web_input = WebInput.blueprint

__all__ = ["WebInput", "web_input"]
104 changes: 104 additions & 0 deletions dimos/agents2/skills/speak_skill.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@
# Copyright 2025 Dimensional Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import threading
import time

from reactivex import Subject

from dimos.core.core import rpc
from dimos.core.skill_module import SkillModule
from dimos.protocol.skill.skill import skill
from dimos.stream.audio.node_output import SounddeviceAudioOutput
from dimos.stream.audio.tts.node_openai import OpenAITTSNode, Voice
from dimos.utils.logging_config import setup_logger

logger = setup_logger("dimos.agents2.skills.speak_skill")


class SpeakSkill(SkillModule):
    """Skill that speaks text aloud via OpenAI TTS and the local audio device."""

    # Created in start(); None while the module is stopped.
    _tts_node: OpenAITTSNode | None = None
    _audio_output: SounddeviceAudioOutput | None = None
    # Serializes speak() calls so utterances do not overlap.
    # NOTE(review): class-level attribute, shared by every instance — fine if
    # this module is a singleton; otherwise move creation into start().
    _audio_lock: threading.Lock = threading.Lock()

    @rpc
    def start(self) -> None:
        """Create the TTS node and sound output and wire them together."""
        super().start()
        self._tts_node = OpenAITTSNode(speed=1.2, voice=Voice.ONYX)
        # Output sample rate matches the TTS node's configured rate.
        self._audio_output = SounddeviceAudioOutput(sample_rate=24000)
        self._audio_output.consume_audio(self._tts_node.emit_audio())

    @rpc
    def stop(self) -> None:
        """Tear down the audio pipeline; safe to call if never started."""
        if self._tts_node:
            self._tts_node.dispose()
            self._tts_node = None
        if self._audio_output:
            self._audio_output.stop()
            self._audio_output = None
        super().stop()

    @skill()
    def speak(self, text: str) -> str:
        """Speak text out loud through the robot's speakers.

        USE THIS TOOL AS OFTEN AS NEEDED. People can't normally see what you say in text, but can hear what you speak.

        Try to be as concise as possible. Remember that speaking takes time, so get to the point quickly.

        Example usage:

        speak("Hello, I am your robot assistant.")
        """
        if self._tts_node is None:
            return "Error: TTS not initialized"

        # Use lock to prevent simultaneous speech
        with self._audio_lock:
            text_subject: Subject[str] = Subject()
            audio_complete = threading.Event()
            self._tts_node.consume_text(text_subject)

            def set_as_complete(_t: str) -> None:
                audio_complete.set()

            def set_as_complete_e(_e: Exception) -> None:
                audio_complete.set()

            # NOTE(review): emit_text() signals that the text was processed,
            # not necessarily that playback has finished — hence the flush
            # sleep below; confirm against OpenAITTSNode's semantics.
            subscription = self._tts_node.emit_text().subscribe(
                on_next=set_as_complete,
                on_error=set_as_complete_e,
            )

            # try/finally guarantees the subscription is disposed on every
            # path (timeout, success, or an unexpected exception) — the
            # original leaked it if wait() or the subject calls raised.
            try:
                text_subject.on_next(text)
                text_subject.on_completed()

                # Scale the wait with utterance length; never below 5 seconds.
                timeout = max(5, len(text) * 0.1)

                if not audio_complete.wait(timeout=timeout):
                    logger.warning(f"TTS timeout reached for: {text}")
                    return f"Warning: TTS timeout while speaking: {text}"

                # Small delay to ensure buffers flush
                time.sleep(0.3)
            finally:
                subscription.dispose()

        return f"Spoke: {text}"


# Blueprint factory for composing SpeakSkill into robot configurations
# (registered as "speak_skill" in the blueprint registry).
speak_skill = SpeakSkill.blueprint

__all__ = ["SpeakSkill", "speak_skill"]
2 changes: 2 additions & 0 deletions dimos/robot/all_blueprints.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,10 +66,12 @@
"osm_skill": "dimos.agents2.skills.osm",
"ros_nav": "dimos.navigation.rosnav",
"spatial_memory": "dimos.perception.spatial_perception",
"speak_skill": "dimos.agents2.skills.speak_skill",
"unitree_skills": "dimos.robot.unitree_webrtc.unitree_skill_container",
"utilization": "dimos.utils.monitoring",
"wavefront_frontier_explorer": "dimos.navigation.frontier_exploration.wavefront_frontier_goal_selector",
"websocket_vis": "dimos.web.websocket_vis.websocket_vis_module",
"web_input": "dimos.agents2.cli.web",
}


Expand Down
2 changes: 1 addition & 1 deletion dimos/robot/unitree_webrtc/unitree_g1_blueprints.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@
from dimos.msgs.nav_msgs import Odometry, Path
from dimos.msgs.sensor_msgs import Image, PointCloud2
from dimos.msgs.std_msgs import Bool
from dimos.msgs.vision_msgs import Detection2DArray, Detection3DArray
from dimos.msgs.vision_msgs import Detection2DArray
from dimos.navigation.bt_navigator.navigator import (
behavior_tree_navigator,
)
Expand Down
4 changes: 4 additions & 0 deletions dimos/robot/unitree_webrtc/unitree_go2_blueprints.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,10 @@

from dimos.agents2.agent import llm_agent
from dimos.agents2.cli.human import human_input
from dimos.agents2.cli.web import web_input
from dimos.agents2.ollama_agent import ollama_installed
from dimos.agents2.skills.navigation import navigation_skill
from dimos.agents2.skills.speak_skill import speak_skill
from dimos.agents2.spec import Provider
from dimos.constants import DEFAULT_CAPACITY_COLOR_IMAGE
from dimos.core.blueprints import autoconnect
Expand Down Expand Up @@ -113,6 +115,8 @@
human_input(),
navigation_skill(),
unitree_skills(),
web_input(),
speak_skill(),
)

agentic = autoconnect(
Expand Down
5 changes: 0 additions & 5 deletions dimos/robot/unitree_webrtc/unitree_skill_container.py
Original file line number Diff line number Diff line change
Expand Up @@ -105,11 +105,6 @@ def current_time(self): # type: ignore[no-untyped-def]
yield str(datetime.datetime.now())
time.sleep(1)

@skill()
def speak(self, text: str) -> str:
"""Speak text out loud through the robot's speakers."""
return f"This is being said aloud: {text}"

@skill()
def execute_sport_command(self, command_name: str) -> str:
if self._publish_request is None:
Expand Down
17 changes: 13 additions & 4 deletions dimos/web/dimos_interface/api/server.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,7 @@ def __init__( # type: ignore[no-untyped-def]
print("Starting FastAPIServer initialization...") # Debug print
super().__init__(dev_name, edge_type)
self.app = FastAPI()
self._server: uvicorn.Server | None = None

# Add CORS middleware with more permissive settings for development
self.app.add_middleware(
Expand Down Expand Up @@ -353,10 +354,18 @@ async def text_stream(key: str): # type: ignore[no-untyped-def]
self.app.get(f"/video_feed/{key}")(self.create_video_feed_route(key)) # type: ignore[no-untyped-call]

def run(self) -> None:
"""Run the FastAPI server."""
uvicorn.run(
self.app, host=self.host, port=self.port
) # TODO: Translate structure to enable in-built workers'
config = uvicorn.Config(
self.app,
host=self.host,
port=self.port,
log_level="error", # Reduce verbosity
)
self._server = uvicorn.Server(config)
self._server.run()

def shutdown(self) -> None:
if self._server is not None:
self._server.should_exit = True


if __name__ == "__main__":
Expand Down
Loading