diff --git a/dimos/agents/agent_ctransformers_gguf.py b/dimos/agents/agent_ctransformers_gguf.py deleted file mode 100644 index 17d233437d..0000000000 --- a/dimos/agents/agent_ctransformers_gguf.py +++ /dev/null @@ -1,215 +0,0 @@ -# Copyright 2025 Dimensional Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import annotations - -# Standard library imports -import logging -import os -from typing import TYPE_CHECKING, Any - -# Third-party imports -from dotenv import load_dotenv -from reactivex import Observable, create -import torch - -# Local imports -from dimos.agents.agent import LLMAgent -from dimos.agents.prompt_builder.impl import PromptBuilder -from dimos.utils.logging_config import setup_logger - -# Initialize environment variables -load_dotenv() - -# Initialize logger for the agent module -logger = setup_logger("dimos.agents", level=logging.DEBUG) - -from ctransformers import AutoModelForCausalLM as CTransformersModel - -if TYPE_CHECKING: - from reactivex.scheduler import ThreadPoolScheduler - from reactivex.subject import Subject - - from dimos.agents.memory.base import AbstractAgentSemanticMemory - - -class CTransformersTokenizerAdapter: - def __init__(self, model) -> None: - self.model = model - - def encode(self, text: str, **kwargs): - return self.model.tokenize(text) - - def decode(self, token_ids, **kwargs): - return self.model.detokenize(token_ids) - - def token_count(self, text: str): - return len(self.tokenize_text(text)) if text else 0 - - def tokenize_text(self, text: str): - return self.model.tokenize(text) - - def detokenize_text(self, tokenized_text): - try: - return self.model.detokenize(tokenized_text) - except Exception as e: - raise ValueError(f"Failed to detokenize text. Error: {e!s}") - - def apply_chat_template( - self, conversation, tokenize: bool = False, add_generation_prompt: bool = True - ): - prompt = "" - for message in conversation: - role = message["role"] - content = message["content"] - if role == "system": - prompt += f"<|system|>\n{content}\n" - elif role == "user": - prompt += f"<|user|>\n{content}\n" - elif role == "assistant": - prompt += f"<|assistant|>\n{content}\n" - if add_generation_prompt: - prompt += "<|assistant|>\n" - return prompt - - -# CTransformers Agent Class -class CTransformersGGUFAgent(LLMAgent): - def __init__( - self, - dev_name: str, - agent_type: str = "HF-LLM", - model_name: str = "TheBloke/Llama-2-7B-GGUF", - model_file: str = "llama-2-7b.Q4_K_M.gguf", - model_type: str = "llama", - gpu_layers: int = 50, - device: str = "auto", - query: str = "How many r's are in the word 'strawberry'?", - input_query_stream: Observable | None = None, - input_video_stream: Observable | None = None, - output_dir: str = os.path.join(os.getcwd(), "assets", "agent"), - agent_memory: AbstractAgentSemanticMemory | None = None, - system_query: str | None = "You are a helpful assistant.", - max_output_tokens_per_request: int = 10, - max_input_tokens_per_request: int = 250, - prompt_builder: PromptBuilder | None = None, - pool_scheduler: ThreadPoolScheduler | None = None, - process_all_inputs: bool | None = None, - ) -> None: - # Determine appropriate default for process_all_inputs if not provided - if process_all_inputs is None: - # Default to True for text queries, False for video streams - if input_query_stream is not None and input_video_stream is None: - process_all_inputs = True - else: - process_all_inputs = False - - super().__init__( - dev_name=dev_name, - agent_type=agent_type, - agent_memory=agent_memory, - pool_scheduler=pool_scheduler, - process_all_inputs=process_all_inputs, - system_query=system_query, - max_output_tokens_per_request=max_output_tokens_per_request, - max_input_tokens_per_request=max_input_tokens_per_request, - ) - - self.query = query - self.output_dir = output_dir - os.makedirs(self.output_dir, exist_ok=True) - - self.model_name = model_name - self.device = device - if self.device == "auto": - self.device = "cuda" if torch.cuda.is_available() else "cpu" - if self.device == "cuda": - print(f"Using GPU: {torch.cuda.get_device_name(0)}") - else: - print("GPU not available, using CPU") - print(f"Device: {self.device}") - - self.model = CTransformersModel.from_pretrained( - model_name, model_file=model_file, model_type=model_type, gpu_layers=gpu_layers - ) - - self.tokenizer = CTransformersTokenizerAdapter(self.model) - - self.prompt_builder = prompt_builder or PromptBuilder( - self.model_name, tokenizer=self.tokenizer - ) - - self.max_output_tokens_per_request = max_output_tokens_per_request - - # self.stream_query(self.query).subscribe(lambda x: print(x)) - - self.input_video_stream = input_video_stream - self.input_query_stream = input_query_stream - - # Ensure only one input stream is provided. - if self.input_video_stream is not None and self.input_query_stream is not None: - raise ValueError( - "More than one input stream provided. Please provide only one input stream." - ) - - if self.input_video_stream is not None: - logger.info("Subscribing to input video stream...") - self.disposables.add(self.subscribe_to_image_processing(self.input_video_stream)) - if self.input_query_stream is not None: - logger.info("Subscribing to input query stream...") - self.disposables.add(self.subscribe_to_query_processing(self.input_query_stream)) - - def _send_query(self, messages: list) -> Any: - try: - _BLUE_PRINT_COLOR: str = "\033[34m" - _RESET_COLOR: str = "\033[0m" - - # === FIX: Flatten message content === - flat_messages = [] - for msg in messages: - role = msg["role"] - content = msg["content"] - if isinstance(content, list): - # Assume it's a list of {'type': 'text', 'text': ...} - text_parts = [c["text"] for c in content if isinstance(c, dict) and "text" in c] - content = " ".join(text_parts) - flat_messages.append({"role": role, "content": content}) - - print(f"{_BLUE_PRINT_COLOR}Messages: {flat_messages}{_RESET_COLOR}") - - print("Applying chat template...") - prompt_text = self.tokenizer.apply_chat_template( - conversation=flat_messages, tokenize=False, add_generation_prompt=True - ) - print("Chat template applied.") - print(f"Prompt text:\n{prompt_text}") - - response = self.model(prompt_text, max_new_tokens=self.max_output_tokens_per_request) - print("Model response received.") - return response - - except Exception as e: - logger.error(f"Error during HuggingFace query: {e}") - return "Error processing request." - - def stream_query(self, query_text: str) -> Subject: - """ - Creates an observable that processes a text query and emits the response. - """ - return create( - lambda observer, _: self._observable_query(observer, incoming_query=query_text) - ) - - -# endregion HuggingFaceLLMAgent Subclass (HuggingFace-Specific Implementation) diff --git a/dimos/agents/agent_huggingface_local.py b/dimos/agents/agent_huggingface_local.py deleted file mode 100644 index 69d02bb1d2..0000000000 --- a/dimos/agents/agent_huggingface_local.py +++ /dev/null @@ -1,240 +0,0 @@ -# Copyright 2025 Dimensional Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import annotations - -# Standard library imports -import logging -import os -from typing import TYPE_CHECKING, Any - -# Third-party imports -from dotenv import load_dotenv -from reactivex import Observable, create -import torch -from transformers import AutoModelForCausalLM - -# Local imports -from dimos.agents.agent import LLMAgent -from dimos.agents.memory.chroma_impl import LocalSemanticMemory -from dimos.agents.prompt_builder.impl import PromptBuilder -from dimos.agents.tokenizer.huggingface_tokenizer import HuggingFaceTokenizer -from dimos.utils.logging_config import setup_logger - -if TYPE_CHECKING: - from reactivex.scheduler import ThreadPoolScheduler - from reactivex.subject import Subject - - from dimos.agents.memory.base import AbstractAgentSemanticMemory - from dimos.agents.tokenizer.base import AbstractTokenizer - -# Initialize environment variables -load_dotenv() - -# Initialize logger for the agent module -logger = setup_logger("dimos.agents", level=logging.DEBUG) - - -# HuggingFaceLLMAgent Class -class HuggingFaceLocalAgent(LLMAgent): - def __init__( - self, - dev_name: str, - agent_type: str = "HF-LLM", - model_name: str = "Qwen/Qwen2.5-3B", - device: str = "auto", - query: str = "How many r's are in the word 'strawberry'?", - input_query_stream: Observable | None = None, - input_video_stream: Observable | None = None, - output_dir: str = os.path.join(os.getcwd(), "assets", "agent"), - agent_memory: AbstractAgentSemanticMemory | None = None, - system_query: str | None = None, - max_output_tokens_per_request: int | None = None, - max_input_tokens_per_request: int | None = None, - prompt_builder: PromptBuilder | None = None, - tokenizer: AbstractTokenizer | None = None, - image_detail: str = "low", - pool_scheduler: ThreadPoolScheduler | None = None, - process_all_inputs: bool | None = None, - ) -> None: - # Determine appropriate default for process_all_inputs if not provided - if process_all_inputs is None: - # Default to True for text queries, False for video streams - if input_query_stream is not None and input_video_stream is None: - process_all_inputs = True - else: - process_all_inputs = False - - super().__init__( - dev_name=dev_name, - agent_type=agent_type, - agent_memory=agent_memory or LocalSemanticMemory(), - pool_scheduler=pool_scheduler, - process_all_inputs=process_all_inputs, - system_query=system_query, - ) - - self.query = query - self.output_dir = output_dir - os.makedirs(self.output_dir, exist_ok=True) - - self.model_name = model_name - self.device = device - if self.device == "auto": - self.device = "cuda" if torch.cuda.is_available() else "cpu" - if self.device == "cuda": - print(f"Using GPU: {torch.cuda.get_device_name(0)}") - else: - print("GPU not available, using CPU") - print(f"Device: {self.device}") - - self.tokenizer = tokenizer or HuggingFaceTokenizer(self.model_name) - - self.prompt_builder = prompt_builder or PromptBuilder( - self.model_name, tokenizer=self.tokenizer - ) - - self.model = AutoModelForCausalLM.from_pretrained( - model_name, - torch_dtype=torch.float16 if self.device == "cuda" else torch.float32, - device_map=self.device, - ) - - self.max_output_tokens_per_request = max_output_tokens_per_request - - # self.stream_query(self.query).subscribe(lambda x: print(x)) - - self.input_video_stream = input_video_stream - self.input_query_stream = input_query_stream - - # Ensure only one input stream is provided. - if self.input_video_stream is not None and self.input_query_stream is not None: - raise ValueError( - "More than one input stream provided. Please provide only one input stream." - ) - - if self.input_video_stream is not None: - logger.info("Subscribing to input video stream...") - self.disposables.add(self.subscribe_to_image_processing(self.input_video_stream)) - if self.input_query_stream is not None: - logger.info("Subscribing to input query stream...") - self.disposables.add(self.subscribe_to_query_processing(self.input_query_stream)) - - def _send_query(self, messages: list) -> Any: - _BLUE_PRINT_COLOR: str = "\033[34m" - _RESET_COLOR: str = "\033[0m" - - try: - # Log the incoming messages - print(f"{_BLUE_PRINT_COLOR}Messages: {messages!s}{_RESET_COLOR}") - - # Process with chat template - try: - print("Applying chat template...") - prompt_text = self.tokenizer.tokenizer.apply_chat_template( - conversation=[{"role": "user", "content": str(messages)}], - tokenize=False, - add_generation_prompt=True, - ) - print("Chat template applied.") - - # Tokenize the prompt - print("Preparing model inputs...") - model_inputs = self.tokenizer.tokenizer([prompt_text], return_tensors="pt").to( - self.model.device - ) - print("Model inputs prepared.") - - # Generate the response - print("Generating response...") - generated_ids = self.model.generate( - **model_inputs, max_new_tokens=self.max_output_tokens_per_request - ) - - # Extract the generated tokens (excluding the input prompt tokens) - print("Processing generated output...") - generated_ids = [ - output_ids[len(input_ids) :] - for input_ids, output_ids in zip( - model_inputs.input_ids, generated_ids, strict=False - ) - ] - - # Convert tokens back to text - response = self.tokenizer.tokenizer.batch_decode( - generated_ids, skip_special_tokens=True - )[0] - print("Response successfully generated.") - - return response - - except AttributeError as e: - # Handle case where tokenizer doesn't have the expected methods - logger.warning(f"Chat template not available: {e}. Using simple format.") - # Continue with execution and use simple format - - except Exception as e: - # Log any other errors but continue execution - logger.warning( - f"Error in chat template processing: {e}. Falling back to simple format." - ) - - # Fallback approach for models without chat template support - # This code runs if the try block above raises an exception - print("Using simple prompt format...") - - # Convert messages to a simple text format - if ( - isinstance(messages, list) - and messages - and isinstance(messages[0], dict) - and "content" in messages[0] - ): - prompt_text = messages[0]["content"] - else: - prompt_text = str(messages) - - # Tokenize the prompt - model_inputs = self.tokenizer.tokenize_text(prompt_text) - model_inputs = torch.tensor([model_inputs], device=self.model.device) - - # Generate the response - generated_ids = self.model.generate( - input_ids=model_inputs, max_new_tokens=self.max_output_tokens_per_request - ) - - # Extract the generated tokens - generated_ids = generated_ids[0][len(model_inputs[0]) :] - - # Convert tokens back to text - response = self.tokenizer.detokenize_text(generated_ids.tolist()) - print("Response generated using simple format.") - - return response - - except Exception as e: - # Catch all other errors - logger.error(f"Error during query processing: {e}", exc_info=True) - return "Error processing request. Please try again." - - def stream_query(self, query_text: str) -> Subject: - """ - Creates an observable that processes a text query and emits the response. - """ - return create( - lambda observer, _: self._observable_query(observer, incoming_query=query_text) - ) - - -# endregion HuggingFaceLLMAgent Subclass (HuggingFace-Specific Implementation) diff --git a/dimos/agents/agent_huggingface_remote.py b/dimos/agents/agent_huggingface_remote.py deleted file mode 100644 index 5bb5b293d3..0000000000 --- a/dimos/agents/agent_huggingface_remote.py +++ /dev/null @@ -1,146 +0,0 @@ -# Copyright 2025 Dimensional Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import annotations - -# Standard library imports -import logging -import os -from typing import TYPE_CHECKING, Any - -# Third-party imports -from dotenv import load_dotenv -from huggingface_hub import InferenceClient -from reactivex import Observable, create - -# Local imports -from dimos.agents.agent import LLMAgent -from dimos.agents.prompt_builder.impl import PromptBuilder -from dimos.agents.tokenizer.huggingface_tokenizer import HuggingFaceTokenizer -from dimos.utils.logging_config import setup_logger - -if TYPE_CHECKING: - from reactivex.scheduler import ThreadPoolScheduler - from reactivex.subject import Subject - - from dimos.agents.memory.base import AbstractAgentSemanticMemory - from dimos.agents.tokenizer.base import AbstractTokenizer - -# Initialize environment variables -load_dotenv() - -# Initialize logger for the agent module -logger = setup_logger("dimos.agents", level=logging.DEBUG) - - -# HuggingFaceLLMAgent Class -class HuggingFaceRemoteAgent(LLMAgent): - def __init__( - self, - dev_name: str, - agent_type: str = "HF-LLM", - model_name: str = "Qwen/QwQ-32B", - query: str = "How many r's are in the word 'strawberry'?", - input_query_stream: Observable | None = None, - input_video_stream: Observable | None = None, - output_dir: str = os.path.join(os.getcwd(), "assets", "agent"), - agent_memory: AbstractAgentSemanticMemory | None = None, - system_query: str | None = None, - max_output_tokens_per_request: int = 16384, - prompt_builder: PromptBuilder | None = None, - tokenizer: AbstractTokenizer | None = None, - image_detail: str = "low", - pool_scheduler: ThreadPoolScheduler | None = None, - process_all_inputs: bool | None = None, - api_key: str | None = None, - hf_provider: str | None = None, - hf_base_url: str | None = None, - ) -> None: - # Determine appropriate default for process_all_inputs if not provided - if process_all_inputs is None: - # Default to True for text queries, False for video streams - if input_query_stream is not None and input_video_stream is None: - process_all_inputs = True - else: - process_all_inputs = False - - super().__init__( - dev_name=dev_name, - agent_type=agent_type, - agent_memory=agent_memory, - pool_scheduler=pool_scheduler, - process_all_inputs=process_all_inputs, - system_query=system_query, - ) - - self.query = query - self.output_dir = output_dir - os.makedirs(self.output_dir, exist_ok=True) - - self.model_name = model_name - self.prompt_builder = prompt_builder or PromptBuilder( - self.model_name, tokenizer=tokenizer or HuggingFaceTokenizer(self.model_name) - ) - - self.model_name = model_name - - self.max_output_tokens_per_request = max_output_tokens_per_request - - self.api_key = api_key or os.getenv("HF_TOKEN") - self.provider = hf_provider or "hf-inference" - self.base_url = hf_base_url or os.getenv("HUGGINGFACE_PRV_ENDPOINT") - self.client = InferenceClient( - provider=self.provider, - base_url=self.base_url, - api_key=self.api_key, - ) - - # self.stream_query(self.query).subscribe(lambda x: print(x)) - - self.input_video_stream = input_video_stream - self.input_query_stream = input_query_stream - - # Ensure only one input stream is provided. - if self.input_video_stream is not None and self.input_query_stream is not None: - raise ValueError( - "More than one input stream provided. Please provide only one input stream." - ) - - if self.input_video_stream is not None: - logger.info("Subscribing to input video stream...") - self.disposables.add(self.subscribe_to_image_processing(self.input_video_stream)) - if self.input_query_stream is not None: - logger.info("Subscribing to input query stream...") - self.disposables.add(self.subscribe_to_query_processing(self.input_query_stream)) - - def _send_query(self, messages: list) -> Any: - try: - completion = self.client.chat.completions.create( - model=self.model_name, - messages=messages, - max_tokens=self.max_output_tokens_per_request, - ) - - return completion.choices[0].message - except Exception as e: - logger.error(f"Error during HuggingFace query: {e}") - return "Error processing request." - - def stream_query(self, query_text: str) -> Subject: - """ - Creates an observable that processes a text query and emits the response. - """ - return create( - lambda observer, _: self._observable_query(observer, incoming_query=query_text) - ) diff --git a/dimos/agents/cerebras_agent.py b/dimos/agents/cerebras_agent.py deleted file mode 100644 index e58de812d0..0000000000 --- a/dimos/agents/cerebras_agent.py +++ /dev/null @@ -1,613 +0,0 @@ -# Copyright 2025 Dimensional Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Cerebras agent implementation for the DIMOS agent framework. - -This module provides a CerebrasAgent class that implements the LLMAgent interface -for Cerebras inference API using the official Cerebras Python SDK. -""" - -from __future__ import annotations - -import copy -import json -import os -import threading -import time -from typing import TYPE_CHECKING - -from cerebras.cloud.sdk import Cerebras -from dotenv import load_dotenv - -# Local imports -from dimos.agents.agent import LLMAgent -from dimos.agents.prompt_builder.impl import PromptBuilder -from dimos.agents.tokenizer.openai_tokenizer import OpenAITokenizer -from dimos.skills.skills import AbstractSkill, SkillLibrary -from dimos.utils.logging_config import setup_logger - -if TYPE_CHECKING: - from pydantic import BaseModel - from reactivex import Observable - from reactivex.observer import Observer - from reactivex.scheduler import ThreadPoolScheduler - - from dimos.agents.memory.base import AbstractAgentSemanticMemory - from dimos.agents.tokenizer.base import AbstractTokenizer - from dimos.stream.frame_processor import FrameProcessor - -# Initialize environment variables -load_dotenv() - -# Initialize logger for the Cerebras agent -logger = setup_logger("dimos.agents.cerebras") - - -# Response object compatible with LLMAgent -class CerebrasResponseMessage(dict): - def __init__( - self, - content: str = "", - tool_calls=None, - ) -> None: - self.content = content - self.tool_calls = tool_calls or [] - self.parsed = None - - # Initialize as dict with the proper structure - super().__init__(self.to_dict()) - - def __str__(self) -> str: - # Return a string representation for logging - if self.content: - return self.content - elif self.tool_calls: - # Return JSON representation of the first tool call - if self.tool_calls: - tool_call = self.tool_calls[0] - tool_json = { - "name": tool_call.function.name, - "arguments": json.loads(tool_call.function.arguments), - } - return json.dumps(tool_json) - return "[No content]" - - def to_dict(self): - """Convert to dictionary format for JSON serialization.""" - result = {"role": "assistant", "content": self.content or ""} - - if self.tool_calls: - result["tool_calls"] = [] - for tool_call in self.tool_calls: - result["tool_calls"].append( - { - "id": tool_call.id, - "type": "function", - "function": { - "name": tool_call.function.name, - "arguments": tool_call.function.arguments, - }, - } - ) - - return result - - -class CerebrasAgent(LLMAgent): - """Cerebras agent implementation using the official Cerebras Python SDK. - - This class implements the _send_query method to interact with Cerebras API - using their official SDK, allowing most of the LLMAgent logic to be reused. - """ - - def __init__( - self, - dev_name: str, - agent_type: str = "Vision", - query: str = "What do you see?", - input_query_stream: Observable | None = None, - input_video_stream: Observable | None = None, - input_data_stream: Observable | None = None, - output_dir: str = os.path.join(os.getcwd(), "assets", "agent"), - agent_memory: AbstractAgentSemanticMemory | None = None, - system_query: str | None = None, - max_input_tokens_per_request: int = 128000, - max_output_tokens_per_request: int = 16384, - model_name: str = "llama-4-scout-17b-16e-instruct", - skills: AbstractSkill | list[AbstractSkill] | SkillLibrary | None = None, - response_model: BaseModel | None = None, - frame_processor: FrameProcessor | None = None, - image_detail: str = "low", - pool_scheduler: ThreadPoolScheduler | None = None, - process_all_inputs: bool | None = None, - tokenizer: AbstractTokenizer | None = None, - prompt_builder: PromptBuilder | None = None, - ) -> None: - """ - Initializes a new instance of the CerebrasAgent. - - Args: - dev_name (str): The device name of the agent. - agent_type (str): The type of the agent. - query (str): The default query text. - input_query_stream (Observable): An observable for query input. - input_video_stream (Observable): An observable for video frames. - input_data_stream (Observable): An observable for data input. - output_dir (str): Directory for output files. - agent_memory (AbstractAgentSemanticMemory): The memory system. - system_query (str): The system prompt to use with RAG context. - max_input_tokens_per_request (int): Maximum tokens for input. - max_output_tokens_per_request (int): Maximum tokens for output. - model_name (str): The Cerebras model name to use. Available options: - - llama-4-scout-17b-16e-instruct (default, fastest) - - llama3.1-8b - - llama-3.3-70b - - qwen-3-32b - - deepseek-r1-distill-llama-70b (private preview) - skills (Union[AbstractSkill, List[AbstractSkill], SkillLibrary]): Skills available to the agent. - response_model (BaseModel): Optional Pydantic model for structured responses. - frame_processor (FrameProcessor): Custom frame processor. - image_detail (str): Detail level for images ("low", "high", "auto"). - pool_scheduler (ThreadPoolScheduler): The scheduler to use for thread pool operations. - process_all_inputs (bool): Whether to process all inputs or skip when busy. - tokenizer (AbstractTokenizer): The tokenizer for the agent. - prompt_builder (PromptBuilder): The prompt builder for the agent. - """ - # Determine appropriate default for process_all_inputs if not provided - if process_all_inputs is None: - # Default to True for text queries, False for video streams - if input_query_stream is not None and input_video_stream is None: - process_all_inputs = True - else: - process_all_inputs = False - - super().__init__( - dev_name=dev_name, - agent_type=agent_type, - agent_memory=agent_memory, - pool_scheduler=pool_scheduler, - process_all_inputs=process_all_inputs, - system_query=system_query, - input_query_stream=input_query_stream, - input_video_stream=input_video_stream, - input_data_stream=input_data_stream, - ) - - # Initialize Cerebras client - self.client = Cerebras() - - self.query = query - self.output_dir = output_dir - os.makedirs(self.output_dir, exist_ok=True) - - # Initialize conversation history for multi-turn conversations - self.conversation_history = [] - self._history_lock = threading.Lock() - - # Configure skills - self.skills = skills - self.skill_library = None - if isinstance(self.skills, SkillLibrary): - self.skill_library = self.skills - elif isinstance(self.skills, list): - self.skill_library = SkillLibrary() - for skill in self.skills: - self.skill_library.add(skill) - elif isinstance(self.skills, AbstractSkill): - self.skill_library = SkillLibrary() - self.skill_library.add(self.skills) - - self.response_model = response_model - self.model_name = model_name - self.image_detail = image_detail - self.max_output_tokens_per_request = max_output_tokens_per_request - self.max_input_tokens_per_request = max_input_tokens_per_request - self.max_tokens_per_request = max_input_tokens_per_request + max_output_tokens_per_request - - # Add static context to memory. - self._add_context_to_memory() - - # Initialize tokenizer and prompt builder - self.tokenizer = tokenizer or OpenAITokenizer( - model_name="gpt-4o" - ) # Use GPT-4 tokenizer for better accuracy - self.prompt_builder = prompt_builder or PromptBuilder( - model_name=self.model_name, - max_tokens=self.max_input_tokens_per_request, - tokenizer=self.tokenizer, - ) - - logger.info("Cerebras Agent Initialized.") - - def _add_context_to_memory(self) -> None: - """Adds initial context to the agent's memory.""" - context_data = [ - ( - "id0", - "Optical Flow is a technique used to track the movement of objects in a video sequence.", - ), - ( - "id1", - "Edge Detection is a technique used to identify the boundaries of objects in an image.", - ), - ("id2", "Video is a sequence of frames captured at regular intervals."), - ( - "id3", - "Colors in Optical Flow are determined by the movement of light, and can be used to track the movement of objects.", - ), - ( - "id4", - "Json is a data interchange format that is easy for humans to read and write, and easy for machines to parse and generate.", - ), - ] - for doc_id, text in context_data: - self.agent_memory.add_vector(doc_id, text) - - def _build_prompt( - self, - messages: list, - base64_image: str | list[str] | None = None, - dimensions: tuple[int, int] | None = None, - override_token_limit: bool = False, - condensed_results: str = "", - ) -> list: - """Builds a prompt message specifically for Cerebras API. - - Args: - messages (list): Existing messages list to build upon. - base64_image (Union[str, List[str]]): Optional Base64-encoded image(s). - dimensions (Tuple[int, int]): Optional image dimensions. - override_token_limit (bool): Whether to override token limits. - condensed_results (str): The condensed RAG context. - - Returns: - list: Messages formatted for Cerebras API. - """ - # Add system message if provided and not already in history - if self.system_query and (not messages or messages[0].get("role") != "system"): - messages.insert(0, {"role": "system", "content": self.system_query}) - logger.info("Added system message to conversation") - - # Append user query while handling RAG - if condensed_results: - user_message = {"role": "user", "content": f"{condensed_results}\n\n{self.query}"} - logger.info("Created user message with RAG context") - else: - user_message = {"role": "user", "content": self.query} - - messages.append(user_message) - - if base64_image is not None: - # Handle both single image (str) and multiple images (List[str]) - images = [base64_image] if isinstance(base64_image, str) else base64_image - - # For Cerebras, we'll add images inline with text (OpenAI-style format) - for img in images: - img_content = [ - {"type": "text", "text": "Here is an image to analyze:"}, - { - "type": "image_url", - "image_url": { - "url": f"data:image/jpeg;base64,{img}", - "detail": self.image_detail, - }, - }, - ] - messages.append({"role": "user", "content": img_content}) - - logger.info(f"Added {len(images)} image(s) to conversation") - - # Use new truncation function - messages = self._truncate_messages(messages, override_token_limit) - - return messages - - def _truncate_messages(self, messages: list, override_token_limit: bool = False) -> list: - """Truncate messages if total tokens exceed 16k using existing truncate_tokens method. - - Args: - messages (list): List of message dictionaries - override_token_limit (bool): Whether to skip truncation - - Returns: - list: Messages with content truncated if needed - """ - if override_token_limit: - return messages - - total_tokens = 0 - for message in messages: - if isinstance(message.get("content"), str): - total_tokens += self.prompt_builder.tokenizer.token_count(message["content"]) - elif isinstance(message.get("content"), list): - for item in message["content"]: - if item.get("type") == "text": - total_tokens += self.prompt_builder.tokenizer.token_count(item["text"]) - elif item.get("type") == "image_url": - total_tokens += 85 - - if total_tokens > 16000: - excess_tokens = total_tokens - 16000 - current_tokens = total_tokens - - # Start from oldest messages and truncate until under 16k - for i in range(len(messages)): - if current_tokens <= 16000: - break - - msg = messages[i] - if msg.get("role") == "system": - continue - - if isinstance(msg.get("content"), str): - original_tokens = self.prompt_builder.tokenizer.token_count(msg["content"]) - # Calculate how much to truncate from this message - tokens_to_remove = min(excess_tokens, original_tokens // 3) - new_max_tokens = max(50, original_tokens - tokens_to_remove) - - msg["content"] = self.prompt_builder.truncate_tokens( - msg["content"], new_max_tokens, "truncate_end" - ) - - new_tokens = self.prompt_builder.tokenizer.token_count(msg["content"]) - tokens_saved = original_tokens - new_tokens - current_tokens -= tokens_saved - excess_tokens -= tokens_saved - - logger.info( - f"Truncated older messages using truncate_tokens, final tokens: {current_tokens}" - ) - else: - logger.info(f"No truncation needed, total tokens: {total_tokens}") - - return messages - - def clean_cerebras_schema(self, schema: dict) -> dict: - """Simple schema cleaner that removes unsupported fields for Cerebras API.""" - if not isinstance(schema, dict): - return schema - - # Removing the problematic fields that pydantic generates - cleaned = {} - unsupported_fields = { - "minItems", - "maxItems", - "uniqueItems", - "exclusiveMinimum", - "exclusiveMaximum", - "minimum", - "maximum", - } - - for key, value in schema.items(): - if key in unsupported_fields: - continue # Skip unsupported fields - elif isinstance(value, dict): - cleaned[key] = self.clean_cerebras_schema(value) - elif isinstance(value, list): - cleaned[key] = [ - self.clean_cerebras_schema(item) if isinstance(item, dict) else item - for item in value - ] - else: - cleaned[key] = value - - return cleaned - - def create_tool_call( - self, - name: str | None = None, - arguments: dict | None = None, - call_id: str | None = None, - content: str | None = None, - ): - """Create a tool call object from either direct parameters or JSON content.""" - # If content is provided, parse it as JSON - if content: - logger.info(f"Creating tool call from content: {content}") - try: - content_json = json.loads(content) - if ( - isinstance(content_json, dict) - and "name" in content_json - and "arguments" in content_json - ): - name = content_json["name"] - arguments = content_json["arguments"] - else: - return None - except json.JSONDecodeError: - logger.warning("Content appears to be JSON but failed to parse") - return None - - # Create the tool call object - if name and arguments is not None: - timestamp = int(time.time() * 1000000) # microsecond precision - tool_id = f"call_{timestamp}" - - logger.info(f"Creating tool call with timestamp ID: {tool_id}") - return type( - "ToolCall", - (), - { - "id": tool_id, - "function": type( - "Function", (), {"name": name, "arguments": json.dumps(arguments)} - ), - }, - ) - - return None - - def _send_query(self, messages: list) -> CerebrasResponseMessage: - """Sends the query to Cerebras API using the official Cerebras SDK. - - Args: - messages (list): The prompt messages to send. - - Returns: - The response message from Cerebras wrapped in our CerebrasResponseMessage class. - - Raises: - Exception: If no response message is returned from the API. - ConnectionError: If there's an issue connecting to the API. - ValueError: If the messages or other parameters are invalid. - """ - try: - # Prepare API call parameters - api_params = { - "model": self.model_name, - "messages": messages, - # "max_tokens": self.max_output_tokens_per_request, - } - - # Add tools if available - if self.skill_library and self.skill_library.get_tools(): - tools = self.skill_library.get_tools() - for tool in tools: - if "function" in tool and "parameters" in tool["function"]: - tool["function"]["parameters"] = self.clean_cerebras_schema( - tool["function"]["parameters"] - ) - api_params["tools"] = tools - api_params["tool_choice"] = "auto" - - if self.response_model is not None: - api_params["response_format"] = { - "type": "json_object", - "schema": self.response_model, - } - - # Make the API call - response = self.client.chat.completions.create(**api_params) - - raw_message = response.choices[0].message - if raw_message is None: - logger.error("Response message does not exist.") - raise Exception("Response message does not exist.") - - # Process response into final format - content = raw_message.content - tool_calls = getattr(raw_message, "tool_calls", None) - - # If no structured tool calls from API, try parsing content as JSON tool call - if not tool_calls and content and content.strip().startswith("{"): - parsed_tool_call = self.create_tool_call(content=content) - if parsed_tool_call: - tool_calls = [parsed_tool_call] - content = None - - return CerebrasResponseMessage(content=content, tool_calls=tool_calls) - - except ConnectionError as ce: - logger.error(f"Connection error with Cerebras API: {ce}") - raise - except ValueError as ve: - logger.error(f"Invalid parameters for Cerebras API: {ve}") - raise - except Exception as e: - # Print the raw API parameters when an error occurs - logger.error(f"Raw API parameters: {json.dumps(api_params, indent=2)}") - logger.error(f"Unexpected error in Cerebras API call: {e}") - raise - - def _observable_query( - self, - observer: Observer, - base64_image: str | None = None, - dimensions: tuple[int, int] | None = None, - override_token_limit: bool = False, - incoming_query: str | None = None, - reset_conversation: bool = False, - ): - """Main query handler that manages conversation history and Cerebras interactions. - - This method follows ClaudeAgent's pattern for efficient conversation history management. - - Args: - observer (Observer): The observer to emit responses to. - base64_image (str): Optional Base64-encoded image. - dimensions (Tuple[int, int]): Optional image dimensions. - override_token_limit (bool): Whether to override token limits. - incoming_query (str): Optional query to update the agent's query. - reset_conversation (bool): Whether to reset the conversation history. - """ - try: - # Reset conversation history if requested - if reset_conversation: - self.conversation_history = [] - logger.info("Conversation history reset") - - # Create a local copy of conversation history and record its length - messages = copy.deepcopy(self.conversation_history) - - # Update query and get context - self._update_query(incoming_query) - _, condensed_results = self._get_rag_context() - - # Build prompt - messages = self._build_prompt( - messages, base64_image, dimensions, override_token_limit, condensed_results - ) - - while True: - logger.info("Sending Query.") - response_message = self._send_query(messages) - logger.info(f"Received Response: {response_message}") - - if response_message is None: - raise Exception("Response message does not exist.") - - # If no skill library or no tool calls, we're done - if ( - self.skill_library is None - or self.skill_library.get_tools() is None - or response_message.tool_calls is None - ): - final_msg = ( - response_message.parsed - if hasattr(response_message, "parsed") and response_message.parsed - else ( - response_message.content - if hasattr(response_message, "content") - else response_message - ) - ) - messages.append(response_message) - break - - logger.info(f"Assistant requested {len(response_message.tool_calls)} tool call(s)") - next_response = self._handle_tooling(response_message, messages) - - if next_response is None: - final_msg = response_message.content or "" - break - - response_message = next_response - - with self._history_lock: - self.conversation_history = messages - logger.info( - f"Updated conversation history (total: {len(self.conversation_history)} messages)" - ) - - # Emit the final message content to the observer - observer.on_next(final_msg) - self.response_subject.on_next(final_msg) - observer.on_completed() - - except Exception as e: - logger.error(f"Query failed in {self.dev_name}: {e}") - observer.on_error(e) - self.response_subject.on_error(e) diff --git a/dimos/agents/modules/agent_pool.py b/dimos/agents/modules/agent_pool.py deleted file mode 100644 index 08ef943765..0000000000 --- a/dimos/agents/modules/agent_pool.py +++ /dev/null @@ -1,232 +0,0 @@ -# Copyright 2025 Dimensional Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Agent pool module for managing multiple agents.""" - -from typing import Any - -from reactivex import operators as ops -from reactivex.subject import Subject - -from dimos.agents.modules.base_agent import BaseAgentModule -from dimos.agents.modules.unified_agent import UnifiedAgentModule -from dimos.core import In, Module, Out, rpc -from dimos.utils.logging_config import setup_logger - -logger = setup_logger("dimos.agents.modules.agent_pool") - - -class AgentPoolModule(Module): - """Lightweight agent pool for managing multiple agents. - - This module enables: - - Multiple agent deployment with different configurations - - Query routing based on agent ID or capabilities - - Load balancing across agents - - Response aggregation from multiple agents - """ - - # Module I/O - query_in: In[dict[str, Any]] = None # {agent_id: str, query: str, ...} - response_out: Out[dict[str, Any]] = None # {agent_id: str, response: str, ...} - - def __init__( - self, agents_config: dict[str, dict[str, Any]], default_agent: str | None = None - ) -> None: - """Initialize agent pool. - - Args: - agents_config: Configuration for each agent - { - "agent_id": { - "model": "openai::gpt-4o", - "skills": SkillLibrary(), - "system_prompt": "...", - ... - } - } - default_agent: Default agent ID to use if not specified - """ - super().__init__() - - self._config = agents_config - self._default_agent = default_agent or next(iter(agents_config.keys())) - self._agents = {} - - # Response routing - self._response_subject = Subject() - - @rpc - def start(self) -> None: - """Deploy and start all agents.""" - super().start() - logger.info(f"Starting agent pool with {len(self._config)} agents") - - # Deploy agents based on config - for agent_id, config in self._config.items(): - logger.info(f"Deploying agent: {agent_id}") - - # Determine agent type - agent_type = config.pop("type", "unified") - - if agent_type == "base": - agent = BaseAgentModule(**config) - else: - agent = UnifiedAgentModule(**config) - - # Start the agent - agent.start() - - # Store agent with metadata - self._agents[agent_id] = {"module": agent, "config": config, "type": agent_type} - - # Subscribe to agent responses - self._setup_agent_routing(agent_id, agent) - - # Subscribe to incoming queries - if self.query_in: - self._disposables.add(self.query_in.observable().subscribe(self._route_query)) - - # Connect response subject to output - if self.response_out: - self._disposables.add(self._response_subject.subscribe(self.response_out.publish)) - - logger.info("Agent pool started") - - @rpc - def stop(self) -> None: - """Stop all agents.""" - logger.info("Stopping agent pool") - - # Stop all agents - for agent_id, agent_info in self._agents.items(): - try: - agent_info["module"].stop() - except Exception as e: - logger.error(f"Error stopping agent {agent_id}: {e}") - - # Clear agents - self._agents.clear() - super().stop() - - @rpc - def add_agent(self, agent_id: str, config: dict[str, Any]) -> None: - """Add a new agent to the pool.""" - if agent_id in self._agents: - logger.warning(f"Agent {agent_id} already exists") - return - - # Deploy and start agent - agent_type = config.pop("type", "unified") - - if agent_type == "base": - agent = BaseAgentModule(**config) - else: - agent = UnifiedAgentModule(**config) - - agent.start() - - # Store and setup routing - self._agents[agent_id] = {"module": agent, "config": config, "type": agent_type} - self._setup_agent_routing(agent_id, agent) - - logger.info(f"Added agent: {agent_id}") - - @rpc - def remove_agent(self, agent_id: str) -> None: - """Remove an agent from the pool.""" - if agent_id not in self._agents: - logger.warning(f"Agent {agent_id} not found") - return - - # Stop and remove agent - agent_info = self._agents[agent_id] - agent_info["module"].stop() - del self._agents[agent_id] - - logger.info(f"Removed agent: {agent_id}") - - @rpc - def list_agents(self) -> list[dict[str, Any]]: - """List all agents and their configurations.""" - return [ - {"id": agent_id, "type": info["type"], "model": info["config"].get("model", "unknown")} - for agent_id, info in self._agents.items() - ] - - @rpc - def broadcast_query(self, query: str, exclude: list[str] | None = None) -> None: - """Send query to all agents (except excluded ones).""" - exclude = exclude or [] - - for agent_id, agent_info in self._agents.items(): - if agent_id not in exclude: - agent_info["module"].query_in.publish(query) - - logger.info(f"Broadcasted query to {len(self._agents) - len(exclude)} agents") - - def _setup_agent_routing( - self, agent_id: str, agent: BaseAgentModule | UnifiedAgentModule - ) -> None: - """Setup response routing for an agent.""" - - # Subscribe to agent responses and tag with agent_id - def tag_response(response: str) -> dict[str, Any]: - return { - "agent_id": agent_id, - "response": response, - "type": self._agents[agent_id]["type"], - } - - self._disposables.add( - agent.response_out.observable() - .pipe(ops.map(tag_response)) - .subscribe(self._response_subject.on_next) - ) - - def _route_query(self, msg: dict[str, Any]) -> None: - """Route incoming query to appropriate agent(s).""" - # Extract routing info - agent_id = msg.get("agent_id", self._default_agent) - query = msg.get("query", "") - broadcast = msg.get("broadcast", False) - - if broadcast: - # Send to all agents - exclude = msg.get("exclude", []) - self.broadcast_query(query, exclude) - elif agent_id == "round_robin": - # Simple round-robin routing - agent_ids = list(self._agents.keys()) - if agent_ids: - # Use query hash for consistent routing - idx = hash(query) % len(agent_ids) - selected_agent = agent_ids[idx] - self._agents[selected_agent]["module"].query_in.publish(query) - logger.debug(f"Routed to {selected_agent} (round-robin)") - elif agent_id in self._agents: - # Route to specific agent - self._agents[agent_id]["module"].query_in.publish(query) - logger.debug(f"Routed to {agent_id}") - else: - logger.warning(f"Unknown agent ID: {agent_id}, using default: {self._default_agent}") - if self._default_agent in self._agents: - self._agents[self._default_agent]["module"].query_in.publish(query) - - # Handle additional routing options - if "image" in msg and hasattr(self._agents.get(agent_id, {}).get("module"), "image_in"): - self._agents[agent_id]["module"].image_in.publish(msg["image"]) - - if "data" in msg and hasattr(self._agents.get(agent_id, {}).get("module"), "data_in"): - self._agents[agent_id]["module"].data_in.publish(msg["data"]) diff --git a/dimos/agents/modules/simple_vision_agent.py b/dimos/agents/modules/simple_vision_agent.py deleted file mode 100644 index b4888fd073..0000000000 --- a/dimos/agents/modules/simple_vision_agent.py +++ /dev/null @@ -1,238 +0,0 @@ -# Copyright 2025 Dimensional Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Simple vision agent module following exact DimOS patterns.""" - -import asyncio -import base64 -import io -import threading - -import numpy as np -from PIL import Image as PILImage -from reactivex.disposable import Disposable - -from dimos.agents.modules.gateway import UnifiedGatewayClient -from dimos.core import In, Module, Out, rpc -from dimos.msgs.sensor_msgs import Image -from dimos.utils.logging_config import setup_logger - -logger = setup_logger(__file__) - - -class SimpleVisionAgentModule(Module): - """Simple vision agent that can process images with text queries. - - This follows the exact pattern from working modules without any extras. - """ - - # Module I/O - query_in: In[str] = None - image_in: In[Image] = None - response_out: Out[str] = None - - def __init__( - self, - model: str = "openai::gpt-4o-mini", - system_prompt: str | None = None, - temperature: float = 0.0, - max_tokens: int = 4096, - ) -> None: - """Initialize the vision agent. - - Args: - model: Model identifier (e.g., "openai::gpt-4o-mini") - system_prompt: System prompt for the agent - temperature: Sampling temperature - max_tokens: Maximum tokens to generate - """ - super().__init__() - - self.model = model - self.system_prompt = system_prompt or "You are a helpful vision AI assistant." - self.temperature = temperature - self.max_tokens = max_tokens - - # State - self.gateway = None - self._latest_image = None - self._processing = False - self._lock = threading.Lock() - - @rpc - def start(self) -> None: - """Initialize and start the agent.""" - super().start() - - logger.info(f"Starting simple vision agent with model: {self.model}") - - # Initialize gateway - self.gateway = UnifiedGatewayClient() - - # Subscribe to inputs - if self.query_in: - unsub = self.query_in.subscribe(self._handle_query) - self._disposables.add(Disposable(unsub)) - - if self.image_in: - unsub = self.image_in.subscribe(self._handle_image) - self._disposables.add(Disposable(unsub)) - - logger.info("Simple vision agent started") - - @rpc - def stop(self) -> None: - logger.info("Stopping simple vision agent") - if self.gateway: - self.gateway.close() - - super().stop() - - def _handle_image(self, image: Image) -> None: - """Handle incoming image.""" - logger.info( - f"Received new image: {image.data.shape if hasattr(image, 'data') else 'unknown shape'}" - ) - self._latest_image = image - - def _handle_query(self, query: str) -> None: - """Handle text query.""" - with self._lock: - if self._processing: - logger.warning("Already processing, skipping query") - return - self._processing = True - - # Process in thread - thread = threading.Thread(target=self._run_async_query, args=(query,)) - thread.daemon = True - thread.start() - - def _run_async_query(self, query: str) -> None: - """Run async query in new event loop.""" - asyncio.run(self._process_query(query)) - - async def _process_query(self, query: str) -> None: - """Process the query.""" - try: - logger.info(f"Processing query: {query}") - - # Build messages - messages = [{"role": "system", "content": self.system_prompt}] - - # Check if we have an image - if self._latest_image: - logger.info("Have latest image, encoding...") - image_b64 = self._encode_image(self._latest_image) - if image_b64: - logger.info(f"Image encoded successfully, size: {len(image_b64)} bytes") - # Add user message with image - if "anthropic" in self.model: - # Anthropic format - messages.append( - { - "role": "user", - "content": [ - {"type": "text", "text": query}, - { - "type": "image", - "source": { - "type": "base64", - "media_type": "image/jpeg", - "data": image_b64, - }, - }, - ], - } - ) - else: - # OpenAI format - messages.append( - { - "role": "user", - "content": [ - {"type": "text", "text": query}, - { - "type": "image_url", - "image_url": { - "url": f"data:image/jpeg;base64,{image_b64}", - "detail": "auto", - }, - }, - ], - } - ) - else: - # No image encoding, just text - logger.warning("Failed to encode image") - messages.append({"role": "user", "content": query}) - else: - # No image at all - logger.warning("No image available") - messages.append({"role": "user", "content": query}) - - # Make inference call - response = await self.gateway.ainference( - model=self.model, - messages=messages, - temperature=self.temperature, - max_tokens=self.max_tokens, - stream=False, - ) - - # Extract response - message = response["choices"][0]["message"] - content = message.get("content", "") - - # Emit response - if self.response_out and content: - self.response_out.publish(content) - - except Exception as e: - logger.error(f"Error processing query: {e}") - import traceback - - traceback.print_exc() - if self.response_out: - self.response_out.publish(f"Error: {e!s}") - finally: - with self._lock: - self._processing = False - - def _encode_image(self, image: Image) -> str | None: - """Encode image to base64.""" - try: - # Convert to numpy array if needed - if hasattr(image, "data"): - img_array = image.data - else: - img_array = np.array(image) - - # Convert to PIL Image - pil_image = PILImage.fromarray(img_array) - - # Convert to RGB if needed - if pil_image.mode != "RGB": - pil_image = pil_image.convert("RGB") - - # Encode to base64 - buffer = io.BytesIO() - pil_image.save(buffer, format="JPEG") - img_b64 = base64.b64encode(buffer.getvalue()).decode("utf-8") - - return img_b64 - - except Exception as e: - logger.error(f"Failed to encode image: {e}") - return None diff --git a/dimos/agents/planning_agent.py b/dimos/agents/planning_agent.py deleted file mode 100644 index 6dbdbf5866..0000000000 --- a/dimos/agents/planning_agent.py +++ /dev/null @@ -1,318 +0,0 @@ -# Copyright 2025 Dimensional Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from textwrap import dedent -import threading -import time -from typing import Literal - -from pydantic import BaseModel -from reactivex import Observable, operators as ops - -from dimos.agents.agent import OpenAIAgent -from dimos.skills.skills import AbstractSkill -from dimos.utils.logging_config import setup_logger - -logger = setup_logger("dimos.agents.planning_agent") - - -# For response validation -class PlanningAgentResponse(BaseModel): - type: Literal["dialogue", "plan"] - content: list[str] - needs_confirmation: bool - - -class PlanningAgent(OpenAIAgent): - """Agent that plans and breaks down tasks through dialogue. - - This agent specializes in: - 1. Understanding complex tasks through dialogue - 2. Breaking tasks into concrete, executable steps - 3. Refining plans based on user feedback - 4. Streaming individual steps to ExecutionAgents - - The agent maintains conversation state and can refine plans until - the user confirms they are ready to execute. - """ - - def __init__( - self, - dev_name: str = "PlanningAgent", - model_name: str = "gpt-4", - input_query_stream: Observable | None = None, - use_terminal: bool = False, - skills: AbstractSkill | None = None, - ) -> None: - """Initialize the planning agent. - - Args: - dev_name: Name identifier for the agent - model_name: OpenAI model to use - input_query_stream: Observable stream of user queries - use_terminal: Whether to enable terminal input - skills: Available skills/functions for the agent - """ - # Planning state - self.conversation_history = [] - self.current_plan = [] - self.plan_confirmed = False - self.latest_response = None - - # Build system prompt - skills_list = [] - if skills is not None: - skills_list = skills.get_tools() - - system_query = dedent(f""" - You are a Robot planning assistant that helps break down tasks into concrete, executable steps. - Your goal is to: - 1. Break down the task into clear, sequential steps - 2. Refine the plan based on user feedback as needed - 3. Only finalize the plan when the user explicitly confirms - - You have the following skills at your disposal: - {skills_list} - - IMPORTANT: You MUST ALWAYS respond with ONLY valid JSON in the following format, with no additional text or explanation: - {{ - "type": "dialogue" | "plan", - "content": string | list[string], - "needs_confirmation": boolean - }} - - Your goal is to: - 1. Understand the user's task through dialogue - 2. Break it down into clear, sequential steps - 3. Refine the plan based on user feedback - 4. Only finalize the plan when the user explicitly confirms - - For dialogue responses, use: - {{ - "type": "dialogue", - "content": "Your message to the user", - "needs_confirmation": false - }} - - For plan proposals, use: - {{ - "type": "plan", - "content": ["Execute", "Execute", ...], - "needs_confirmation": true - }} - - Remember: ONLY output valid JSON, no other text.""") - - # Initialize OpenAIAgent with our configuration - super().__init__( - dev_name=dev_name, - agent_type="Planning", - query="", # Will be set by process_user_input - model_name=model_name, - input_query_stream=input_query_stream, - system_query=system_query, - max_output_tokens_per_request=1000, - response_model=PlanningAgentResponse, - ) - logger.info("Planning agent initialized") - - # Set up terminal mode if requested - self.use_terminal = use_terminal - use_terminal = False - if use_terminal: - # Start terminal interface in a separate thread - logger.info("Starting terminal interface in a separate thread") - terminal_thread = threading.Thread(target=self.start_terminal_interface, daemon=True) - terminal_thread.start() - - def _handle_response(self, response) -> None: - """Handle the agent's response and update state. - - Args: - response: ParsedChatCompletionMessage containing PlanningAgentResponse - """ - print("handle response", response) - print("handle response type", type(response)) - - # Extract the PlanningAgentResponse from parsed field if available - planning_response = response.parsed if hasattr(response, "parsed") else response - print("planning response", planning_response) - print("planning response type", type(planning_response)) - # Convert to dict for storage in conversation history - response_dict = planning_response.model_dump() - self.conversation_history.append(response_dict) - - # If it's a plan, update current plan - if planning_response.type == "plan": - logger.info(f"Updating current plan: {planning_response.content}") - self.current_plan = planning_response.content - - # Store latest response - self.latest_response = response_dict - - def _stream_plan(self) -> None: - """Stream each step of the confirmed plan.""" - logger.info("Starting to stream plan steps") - logger.debug(f"Current plan: {self.current_plan}") - - for i, step in enumerate(self.current_plan, 1): - logger.info(f"Streaming step {i}: {step}") - # Add a small delay between steps to ensure they're processed - time.sleep(0.5) - try: - self.response_subject.on_next(str(step)) - logger.debug(f"Successfully emitted step {i} to response_subject") - except Exception as e: - logger.error(f"Error emitting step {i}: {e}") - - logger.info("Plan streaming completed") - self.response_subject.on_completed() - - def _send_query(self, messages: list) -> PlanningAgentResponse: - """Send query to OpenAI and parse the response. - - Extends OpenAIAgent's _send_query to handle planning-specific response formats. - - Args: - messages: List of message dictionaries - - Returns: - PlanningAgentResponse: Validated response with type, content, and needs_confirmation - """ - try: - return super()._send_query(messages) - except Exception as e: - logger.error(f"Caught exception in _send_query: {e!s}") - return PlanningAgentResponse( - type="dialogue", content=f"Error: {e!s}", needs_confirmation=False - ) - - def process_user_input(self, user_input: str) -> None: - """Process user input and generate appropriate response. - - Args: - user_input: The user's message - """ - if not user_input: - return - - # Check for plan confirmation - if self.current_plan and user_input.lower() in ["yes", "y", "confirm"]: - logger.info("Plan confirmation received") - self.plan_confirmed = True - # Create a proper PlanningAgentResponse with content as a list - confirmation_msg = PlanningAgentResponse( - type="dialogue", - content="Plan confirmed! Streaming steps to execution...", - needs_confirmation=False, - ) - self._handle_response(confirmation_msg) - self._stream_plan() - return - - # Build messages for OpenAI with conversation history - messages = [ - {"role": "system", "content": self.system_query} # Using system_query from OpenAIAgent - ] - - # Add the new user input to conversation history - self.conversation_history.append({"type": "user_message", "content": user_input}) - - # Add complete conversation history including both user and assistant messages - for msg in self.conversation_history: - if msg["type"] == "user_message": - messages.append({"role": "user", "content": msg["content"]}) - elif msg["type"] == "dialogue": - messages.append({"role": "assistant", "content": msg["content"]}) - elif msg["type"] == "plan": - plan_text = "Here's my proposed plan:\n" + "\n".join( - f"{i + 1}. {step}" for i, step in enumerate(msg["content"]) - ) - messages.append({"role": "assistant", "content": plan_text}) - - # Get and handle response - response = self._send_query(messages) - self._handle_response(response) - - def start_terminal_interface(self) -> None: - """Start the terminal interface for input/output.""" - - time.sleep(5) # buffer time for clean terminal interface printing - print("=" * 50) - print("\nDimOS Action PlanningAgent\n") - print("I have access to your Robot() and Robot Skills()") - print( - "Describe your task and I'll break it down into steps using your skills as a reference." - ) - print("Once you're happy with the plan, type 'yes' to execute it.") - print("Type 'quit' to exit.\n") - - while True: - try: - print("=" * 50) - user_input = input("USER > ") - if user_input.lower() in ["quit", "exit"]: - break - - self.process_user_input(user_input) - - # Display response - if self.latest_response["type"] == "dialogue": - print(f"\nPlanner: {self.latest_response['content']}") - elif self.latest_response["type"] == "plan": - print("\nProposed Plan:") - for i, step in enumerate(self.latest_response["content"], 1): - print(f"{i}. {step}") - if self.latest_response["needs_confirmation"]: - print("\nDoes this plan look good? (yes/no)") - - if self.plan_confirmed: - print("\nPlan confirmed! Streaming steps to execution...") - break - - except KeyboardInterrupt: - print("\nStopping...") - break - except Exception as e: - print(f"\nError: {e}") - break - - def get_response_observable(self) -> Observable: - """Gets an observable that emits responses from this agent. - - This method processes the response stream from the parent class, - extracting content from `PlanningAgentResponse` objects and flattening - any lists of plan steps for emission. - - Returns: - Observable: An observable that emits plan steps from the agent. - """ - - def extract_content(response) -> list[str]: - if isinstance(response, PlanningAgentResponse): - if response.type == "plan": - return response.content # List of steps to be emitted individually - else: # dialogue type - return [response.content] # Wrap single dialogue message in a list - else: - return [str(response)] # Wrap non-PlanningAgentResponse in a list - - # Get base observable from parent class - base_observable = super().get_response_observable() - - # Process the stream: extract content and flatten plan lists - return base_observable.pipe( - ops.map(extract_content), - ops.flat_map(lambda items: items), # Flatten the list of items - ) diff --git a/dimos/agents/test_agent_image_message.py b/dimos/agents/test_agent_image_message.py deleted file mode 100644 index c7f84bcefe..0000000000 --- a/dimos/agents/test_agent_image_message.py +++ /dev/null @@ -1,403 +0,0 @@ -#!/usr/bin/env python3 -# Copyright 2025 Dimensional Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Test BaseAgent with AgentMessage containing images.""" - -import logging -import os - -from dotenv import load_dotenv -import numpy as np -import pytest - -from dimos.agents.agent_message import AgentMessage -from dimos.agents.modules.base import BaseAgent -from dimos.msgs.sensor_msgs import Image -from dimos.msgs.sensor_msgs.Image import ImageFormat -from dimos.utils.logging_config import setup_logger - -logger = setup_logger("test_agent_image_message") -# Enable debug logging for base module -logging.getLogger("dimos.agents.modules.base").setLevel(logging.DEBUG) - - -@pytest.mark.tofix -def test_agent_single_image() -> None: - """Test agent with single image in AgentMessage.""" - load_dotenv() - - if not os.getenv("OPENAI_API_KEY"): - pytest.skip("No OPENAI_API_KEY found") - - # Create agent - agent = BaseAgent( - model="openai::gpt-4o-mini", - system_prompt="You are a helpful vision assistant. Describe what you see concisely.", - temperature=0.0, - seed=42, - ) - - # Create AgentMessage with text and single image - msg = AgentMessage() - msg.add_text("What color is this image?") - - # Create a solid red image in RGB format for clarity - red_data = np.zeros((100, 100, 3), dtype=np.uint8) - red_data[:, :, 0] = 255 # R channel (index 0 in RGB) - red_data[:, :, 1] = 0 # G channel (index 1 in RGB) - red_data[:, :, 2] = 0 # B channel (index 2 in RGB) - # Explicitly specify RGB format to avoid confusion - red_img = Image.from_numpy(red_data, format=ImageFormat.RGB) - print(f"[Test] Created image format: {red_img.format}, shape: {red_img.data.shape}") - msg.add_image(red_img) - - # Query - response = agent.query(msg) - print(f"\n[Test] Single image response: '{response.content}'") - - # Verify response - assert response.content is not None - # The model should mention a color or describe the image - response_lower = response.content.lower() - # Accept any color mention since models may see colors differently - color_mentioned = any( - word in response_lower - for word in ["red", "blue", "color", "solid", "image", "shade", "hue"] - ) - assert color_mentioned, f"Expected color description in response, got: {response.content}" - - # Check conversation history - assert agent.conversation.size() == 2 - # User message should have content array - history = agent.conversation.to_openai_format() - user_msg = history[0] - assert user_msg["role"] == "user" - assert isinstance(user_msg["content"], list), "Multimodal message should have content array" - assert len(user_msg["content"]) == 2 # text + image - assert user_msg["content"][0]["type"] == "text" - assert user_msg["content"][0]["text"] == "What color is this image?" - assert user_msg["content"][1]["type"] == "image_url" - - # Clean up - agent.dispose() - - -@pytest.mark.tofix -def test_agent_multiple_images() -> None: - """Test agent with multiple images in AgentMessage.""" - load_dotenv() - - if not os.getenv("OPENAI_API_KEY"): - pytest.skip("No OPENAI_API_KEY found") - - # Create agent - agent = BaseAgent( - model="openai::gpt-4o-mini", - system_prompt="You are a helpful vision assistant that compares images.", - temperature=0.0, - seed=42, - ) - - # Create AgentMessage with multiple images - msg = AgentMessage() - msg.add_text("Compare these three images.") - msg.add_text("What are their colors?") - - # Create three different colored images - red_img = Image(data=np.full((50, 50, 3), [255, 0, 0], dtype=np.uint8)) - green_img = Image(data=np.full((50, 50, 3), [0, 255, 0], dtype=np.uint8)) - blue_img = Image(data=np.full((50, 50, 3), [0, 0, 255], dtype=np.uint8)) - - msg.add_image(red_img) - msg.add_image(green_img) - msg.add_image(blue_img) - - # Query - response = agent.query(msg) - - # Verify response acknowledges the images - response_lower = response.content.lower() - # Check if the model is actually seeing the images - if "unable to view" in response_lower or "can't see" in response_lower: - print(f"WARNING: Model not seeing images: {response.content}") - # Still pass the test but note the issue - else: - # If the model can see images, it should mention some colors - colors_mentioned = sum( - 1 - for color in ["red", "green", "blue", "color", "image", "bright", "dark"] - if color in response_lower - ) - assert colors_mentioned >= 1, ( - f"Expected color/image references, found none in: {response.content}" - ) - - # Check history structure - history = agent.conversation.to_openai_format() - user_msg = history[0] - assert user_msg["role"] == "user" - assert isinstance(user_msg["content"], list) - assert len(user_msg["content"]) == 4 # 1 text + 3 images - assert user_msg["content"][0]["type"] == "text" - assert user_msg["content"][0]["text"] == "Compare these three images. What are their colors?" - - # Verify all images are in the message - for i in range(1, 4): - assert user_msg["content"][i]["type"] == "image_url" - assert user_msg["content"][i]["image_url"]["url"].startswith("data:image/jpeg;base64,") - - # Clean up - agent.dispose() - - -@pytest.mark.tofix -def test_agent_image_with_context() -> None: - """Test agent maintaining context with image queries.""" - load_dotenv() - - if not os.getenv("OPENAI_API_KEY"): - pytest.skip("No OPENAI_API_KEY found") - - # Create agent - agent = BaseAgent( - model="openai::gpt-4o-mini", - system_prompt="You are a helpful vision assistant with good memory.", - temperature=0.0, - seed=42, - ) - - # First query with image - msg1 = AgentMessage() - msg1.add_text("This is my favorite color.") - msg1.add_text("Remember it.") - - # Create purple image - purple_img = Image(data=np.full((80, 80, 3), [128, 0, 128], dtype=np.uint8)) - msg1.add_image(purple_img) - - response1 = agent.query(msg1) - # The model should acknowledge the color or mention the image - assert any( - word in response1.content.lower() - for word in ["purple", "violet", "color", "image", "magenta"] - ), f"Expected color or image reference in response: {response1.content}" - - # Second query without image, referencing the first - response2 = agent.query("What was my favorite color that I showed you?") - # Check if the model acknowledges the previous conversation - response_lower = response2.content.lower() - logger.info(f"Response: {response2.content}") - assert any( - word in response_lower - for word in ["purple", "violet", "color", "favorite", "showed", "image"] - ), f"Agent should reference previous conversation: {response2.content}" - - # Check conversation history has all messages - assert agent.conversation.size() == 4 - - # Clean up - agent.dispose() - - -@pytest.mark.tofix -def test_agent_mixed_content() -> None: - """Test agent with mixed text-only and image queries.""" - load_dotenv() - - if not os.getenv("OPENAI_API_KEY"): - pytest.skip("No OPENAI_API_KEY found") - - # Create agent - agent = BaseAgent( - model="openai::gpt-4o-mini", - system_prompt="You are a helpful assistant that can see images when provided.", - temperature=0.0, - seed=100, - ) - - # Text-only query - response1 = agent.query("Hello! Can you see images?") - assert response1.content is not None - - # Image query - msg2 = AgentMessage() - msg2.add_text("Now look at this image.") - msg2.add_text("What do you see? Describe the scene.") - - # Use first frame from rgbd_frames test data - import numpy as np - from PIL import Image as PILImage - - from dimos.msgs.sensor_msgs import Image - from dimos.utils.data import get_data - - data_path = get_data("rgbd_frames") - image_path = os.path.join(data_path, "color", "00000.png") - - pil_image = PILImage.open(image_path) - image_array = np.array(pil_image) - - image = Image.from_numpy(image_array) - - msg2.add_image(image) - - # Check image encoding - logger.info(f"Image shape: {image.data.shape}") - logger.info(f"Image encoding: {len(image.agent_encode())} chars") - - response2 = agent.query(msg2) - logger.info(f"Image query response: {response2.content}") - logger.info(f"Agent supports vision: {agent._supports_vision}") - logger.info(f"Message has images: {msg2.has_images()}") - logger.info(f"Number of images in message: {len(msg2.images)}") - # Check that the model saw and described the image - assert any( - word in response2.content.lower() - for word in ["desk", "chair", "table", "laptop", "computer", "screen", "monitor"] - ), f"Expected description of office scene, got: {response2.content}" - - # Another text-only query - response3 = agent.query("What did I just show you?") - words = ["office", "room", "hallway", "image", "scene"] - content = response3.content.lower() - - assert any(word in content for word in words), f"{content=}" - - # Check history structure - assert agent.conversation.size() == 6 - history = agent.conversation.to_openai_format() - # First query should be simple string - assert isinstance(history[0]["content"], str) - # Second query should be content array - assert isinstance(history[2]["content"], list) - # Third query should be simple string again - assert isinstance(history[4]["content"], str) - - # Clean up - agent.dispose() - - -@pytest.mark.tofix -def test_agent_empty_image_message() -> None: - """Test edge case with empty parts of AgentMessage.""" - load_dotenv() - - if not os.getenv("OPENAI_API_KEY"): - pytest.skip("No OPENAI_API_KEY found") - - # Create agent - agent = BaseAgent( - model="openai::gpt-4o-mini", - system_prompt="You are a helpful assistant.", - temperature=0.0, - seed=42, - ) - - # AgentMessage with only images, no text - msg = AgentMessage() - # Don't add any text - - # Add a simple colored image - img = Image(data=np.full((60, 60, 3), [255, 255, 0], dtype=np.uint8)) # Yellow - msg.add_image(img) - - response = agent.query(msg) - # Should still work even without text - assert response.content is not None - assert len(response.content) > 0 - - # AgentMessage with empty text parts - msg2 = AgentMessage() - msg2.add_text("") # Empty - msg2.add_text("What") - msg2.add_text("") # Empty - msg2.add_text("color?") - msg2.add_image(img) - - response2 = agent.query(msg2) - # Accept various color interpretations for yellow (RGB 255,255,0) - response_lower = response2.content.lower() - assert any( - color in response_lower for color in ["yellow", "color", "bright", "turquoise", "green"] - ), f"Expected color reference in response: {response2.content}" - - # Clean up - agent.dispose() - - -@pytest.mark.tofix -def test_agent_non_vision_model_with_images() -> None: - """Test that non-vision models handle image input gracefully.""" - load_dotenv() - - if not os.getenv("OPENAI_API_KEY"): - pytest.skip("No OPENAI_API_KEY found") - - # Create agent with non-vision model - agent = BaseAgent( - model="openai::gpt-3.5-turbo", # This model doesn't support vision - system_prompt="You are a helpful assistant.", - temperature=0.0, - seed=42, - ) - - # Try to send an image - msg = AgentMessage() - msg.add_text("What do you see in this image?") - - img = Image(data=np.zeros((100, 100, 3), dtype=np.uint8)) - msg.add_image(img) - - # Should log warning and process as text-only - response = agent.query(msg) - assert response.content is not None - - # Check history - should be text-only - history = agent.conversation.to_openai_format() - user_msg = history[0] - assert isinstance(user_msg["content"], str), "Non-vision model should store text-only" - assert user_msg["content"] == "What do you see in this image?" - - # Clean up - agent.dispose() - - -@pytest.mark.tofix -def test_mock_agent_with_images() -> None: - """Test mock agent with images for CI.""" - # This test doesn't need API keys - - from dimos.agents.test_base_agent_text import MockAgent - - # Create mock agent - agent = MockAgent(model="mock::vision", system_prompt="Mock vision agent") - agent._supports_vision = True # Enable vision support - - # Test with image - msg = AgentMessage() - msg.add_text("What color is this?") - - img = Image(data=np.zeros((50, 50, 3), dtype=np.uint8)) - msg.add_image(img) - - response = agent.query(msg) - assert response.content is not None - assert "Mock response" in response.content or "color" in response.content - - # Check conversation history - assert agent.conversation.size() == 2 - - # Clean up - agent.dispose() diff --git a/dimos/agents/test_agent_message_streams.py b/dimos/agents/test_agent_message_streams.py deleted file mode 100644 index 22d33b46de..0000000000 --- a/dimos/agents/test_agent_message_streams.py +++ /dev/null @@ -1,387 +0,0 @@ -#!/usr/bin/env python3 -# Copyright 2025 Dimensional Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Test BaseAgent with AgentMessage and video streams.""" - -import asyncio -import os -import pickle - -from dotenv import load_dotenv -import pytest -from reactivex import operators as ops - -from dimos import core -from dimos.agents.agent_message import AgentMessage -from dimos.agents.agent_types import AgentResponse -from dimos.agents.modules.base_agent import BaseAgentModule -from dimos.core import In, Module, Out, rpc -from dimos.msgs.sensor_msgs import Image -from dimos.protocol import pubsub -from dimos.utils.data import get_data -from dimos.utils.logging_config import setup_logger -from dimos.utils.testing import TimedSensorReplay - -logger = setup_logger("test_agent_message_streams") - - -class VideoMessageSender(Module): - """Module that sends AgentMessage with video frames every 2 seconds.""" - - message_out: Out[AgentMessage] = None - - def __init__(self, video_path: str) -> None: - super().__init__() - self.video_path = video_path - self._subscription = None - self._frame_count = 0 - - @rpc - def start(self) -> None: - """Start sending video messages.""" - # Use TimedSensorReplay to replay video frames - video_replay = TimedSensorReplay(self.video_path, autocast=Image.from_numpy) - - # Send AgentMessage with frame every 3 seconds (give agent more time to process) - self._subscription = ( - video_replay.stream() - .pipe( - ops.sample(3.0), # Every 3 seconds - ops.take(3), # Only send 3 frames total - ops.map(self._create_message), - ) - .subscribe( - on_next=lambda msg: self._send_message(msg), - on_error=lambda e: logger.error(f"Video stream error: {e}"), - on_completed=lambda: logger.info("Video stream completed"), - ) - ) - - logger.info("Video message streaming started (every 3 seconds, max 3 frames)") - - def _create_message(self, frame: Image) -> AgentMessage: - """Create AgentMessage with frame and query.""" - self._frame_count += 1 - - msg = AgentMessage() - msg.add_text(f"What do you see in frame {self._frame_count}? Describe in one sentence.") - msg.add_image(frame) - - logger.info(f"Created message with frame {self._frame_count}") - return msg - - def _send_message(self, msg: AgentMessage) -> None: - """Send the message and test pickling.""" - # Test that message can be pickled (for module communication) - try: - pickled = pickle.dumps(msg) - pickle.loads(pickled) - logger.info(f"Message pickling test passed - size: {len(pickled)} bytes") - except Exception as e: - logger.error(f"Message pickling failed: {e}") - - self.message_out.publish(msg) - - @rpc - def stop(self) -> None: - """Stop streaming.""" - if self._subscription: - self._subscription.dispose() - self._subscription = None - - -class MultiImageMessageSender(Module): - """Send AgentMessage with multiple images.""" - - message_out: Out[AgentMessage] = None - - def __init__(self, video_path: str) -> None: - super().__init__() - self.video_path = video_path - self.frames = [] - - @rpc - def start(self) -> None: - """Collect some frames.""" - video_replay = TimedSensorReplay(self.video_path, autocast=Image.from_numpy) - - # Collect first 3 frames - video_replay.stream().pipe(ops.take(3)).subscribe( - on_next=lambda frame: self.frames.append(frame), - on_completed=self._send_multi_image_query, - ) - - def _send_multi_image_query(self) -> None: - """Send query with multiple images.""" - if len(self.frames) >= 2: - msg = AgentMessage() - msg.add_text("Compare these images and describe what changed between them.") - - for _i, frame in enumerate(self.frames[:2]): - msg.add_image(frame) - - logger.info(f"Sending multi-image message with {len(msg.images)} images") - - # Test pickling - try: - pickled = pickle.dumps(msg) - logger.info(f"Multi-image message pickle size: {len(pickled)} bytes") - except Exception as e: - logger.error(f"Multi-image pickling failed: {e}") - - self.message_out.publish(msg) - - -class ResponseCollector(Module): - """Collect responses.""" - - response_in: In[AgentResponse] = None - - def __init__(self) -> None: - super().__init__() - self.responses = [] - - @rpc - def start(self) -> None: - self.response_in.subscribe(self._on_response) - - def _on_response(self, resp: AgentResponse) -> None: - logger.info(f"Collected response: {resp.content[:100] if resp.content else 'None'}...") - self.responses.append(resp) - - @rpc - def get_responses(self): - return self.responses - - -@pytest.mark.tofix -@pytest.mark.module -@pytest.mark.asyncio -async def test_agent_message_video_stream() -> None: - """Test BaseAgentModule with AgentMessage containing video frames.""" - load_dotenv() - - if not os.getenv("OPENAI_API_KEY"): - pytest.skip("No OPENAI_API_KEY found") - - pubsub.lcm.autoconf() - - logger.info("Testing BaseAgentModule with AgentMessage video stream...") - dimos = core.start(4) - - try: - # Get test video - data_path = get_data("unitree_office_walk") - video_path = os.path.join(data_path, "video") - - logger.info(f"Using video from: {video_path}") - - # Deploy modules - video_sender = dimos.deploy(VideoMessageSender, video_path) - video_sender.message_out.transport = core.pLCMTransport("/agent/message") - - agent = dimos.deploy( - BaseAgentModule, - model="openai::gpt-4o-mini", - system_prompt="You are a vision assistant. Describe what you see concisely.", - temperature=0.0, - ) - agent.response_out.transport = core.pLCMTransport("/agent/response") - - collector = dimos.deploy(ResponseCollector) - - # Connect modules - agent.message_in.connect(video_sender.message_out) - collector.response_in.connect(agent.response_out) - - # Start modules - agent.start() - collector.start() - video_sender.start() - - logger.info("All modules started, streaming video messages...") - - # Wait for 3 messages to be sent (3 frames * 3 seconds = 9 seconds) - # Plus processing time, wait 12 seconds total - await asyncio.sleep(12) - - # Stop video stream - video_sender.stop() - - # Get all responses - responses = collector.get_responses() - logger.info(f"\nCollected {len(responses)} responses:") - for i, resp in enumerate(responses): - logger.info( - f"\nResponse {i + 1}: {resp.content if isinstance(resp, AgentResponse) else resp}" - ) - - # Verify we got at least 2 responses (sometimes the 3rd frame doesn't get processed in time) - assert len(responses) >= 2, f"Expected at least 2 responses, got {len(responses)}" - - # Verify responses describe actual scene - all_responses = " ".join( - resp.content if isinstance(resp, AgentResponse) else resp for resp in responses - ).lower() - assert any( - word in all_responses - for word in ["office", "room", "hallway", "corridor", "door", "wall", "floor", "frame"] - ), "Responses should describe the office environment" - - logger.info("\n✅ AgentMessage video stream test PASSED!") - - # Stop agent - agent.stop() - - finally: - dimos.close() - dimos.shutdown() - - -@pytest.mark.tofix -@pytest.mark.module -@pytest.mark.asyncio -async def test_agent_message_multi_image() -> None: - """Test BaseAgentModule with AgentMessage containing multiple images.""" - load_dotenv() - - if not os.getenv("OPENAI_API_KEY"): - pytest.skip("No OPENAI_API_KEY found") - - pubsub.lcm.autoconf() - - logger.info("Testing BaseAgentModule with multi-image AgentMessage...") - dimos = core.start(4) - - try: - # Get test video - data_path = get_data("unitree_office_walk") - video_path = os.path.join(data_path, "video") - - # Deploy modules - multi_sender = dimos.deploy(MultiImageMessageSender, video_path) - multi_sender.message_out.transport = core.pLCMTransport("/agent/multi_message") - - agent = dimos.deploy( - BaseAgentModule, - model="openai::gpt-4o-mini", - system_prompt="You are a vision assistant that compares images.", - temperature=0.0, - ) - agent.response_out.transport = core.pLCMTransport("/agent/multi_response") - - collector = dimos.deploy(ResponseCollector) - - # Connect modules - agent.message_in.connect(multi_sender.message_out) - collector.response_in.connect(agent.response_out) - - # Start modules - agent.start() - collector.start() - multi_sender.start() - - logger.info("Modules started, sending multi-image query...") - - # Wait for response - await asyncio.sleep(8) - - # Get responses - responses = collector.get_responses() - logger.info(f"\nCollected {len(responses)} responses:") - for i, resp in enumerate(responses): - logger.info( - f"\nResponse {i + 1}: {resp.content if isinstance(resp, AgentResponse) else resp}" - ) - - # Verify we got a response - assert len(responses) >= 1, f"Expected at least 1 response, got {len(responses)}" - - # Response should mention comparison or multiple images - response_text = ( - responses[0].content if isinstance(responses[0], AgentResponse) else responses[0] - ).lower() - assert any( - word in response_text - for word in ["both", "first", "second", "change", "different", "similar", "compare"] - ), "Response should indicate comparison of multiple images" - - logger.info("\n✅ Multi-image AgentMessage test PASSED!") - - # Stop agent - agent.stop() - - finally: - dimos.close() - dimos.shutdown() - - -@pytest.mark.tofix -def test_agent_message_text_only() -> None: - """Test BaseAgent with text-only AgentMessage.""" - load_dotenv() - - if not os.getenv("OPENAI_API_KEY"): - pytest.skip("No OPENAI_API_KEY found") - - from dimos.agents.modules.base import BaseAgent - - # Create agent - agent = BaseAgent( - model="openai::gpt-4o-mini", - system_prompt="You are a helpful assistant. Answer in 10 words or less.", - temperature=0.0, - seed=42, - ) - - # Test with text-only AgentMessage - msg = AgentMessage() - msg.add_text("What is") - msg.add_text("the capital") - msg.add_text("of France?") - - response = agent.query(msg) - assert "Paris" in response.content, "Expected 'Paris' in response" - - # Test pickling of AgentMessage - pickled = pickle.dumps(msg) - unpickled = pickle.loads(pickled) - assert unpickled.get_combined_text() == "What is the capital of France?" - - # Verify multiple text messages were combined properly - assert len(msg.messages) == 3 - assert msg.messages[0] == "What is" - assert msg.messages[1] == "the capital" - assert msg.messages[2] == "of France?" - - logger.info("✅ Text-only AgentMessage test PASSED!") - - # Clean up - agent.dispose() - - -if __name__ == "__main__": - logger.info("Running AgentMessage stream tests...") - - # Run text-only test first - test_agent_message_text_only() - print("\n" + "=" * 60 + "\n") - - # Run async tests - asyncio.run(test_agent_message_video_stream()) - print("\n" + "=" * 60 + "\n") - asyncio.run(test_agent_message_multi_image()) - - logger.info("\n✅ All AgentMessage tests completed!") diff --git a/dimos/agents/test_agent_pool.py b/dimos/agents/test_agent_pool.py deleted file mode 100644 index b3576b80e2..0000000000 --- a/dimos/agents/test_agent_pool.py +++ /dev/null @@ -1,353 +0,0 @@ -# Copyright 2025 Dimensional Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Test agent pool module.""" - -import asyncio -import os - -from dotenv import load_dotenv -import pytest - -from dimos import core -from dimos.agents.modules.base_agent import BaseAgentModule -from dimos.core import In, Module, Out, rpc -from dimos.protocol import pubsub - - -class PoolRouter(Module): - """Simple router for agent pool.""" - - query_in: In[dict] = None - agent1_out: Out[str] = None - agent2_out: Out[str] = None - agent3_out: Out[str] = None - - @rpc - def start(self) -> None: - self.query_in.subscribe(self._route) - - def _route(self, msg: dict) -> None: - agent_id = msg.get("agent_id", "agent1") - query = msg.get("query", "") - - if agent_id == "agent1" and self.agent1_out: - self.agent1_out.publish(query) - elif agent_id == "agent2" and self.agent2_out: - self.agent2_out.publish(query) - elif agent_id == "agent3" and self.agent3_out: - self.agent3_out.publish(query) - elif agent_id == "all": - # Broadcast to all - if self.agent1_out: - self.agent1_out.publish(query) - if self.agent2_out: - self.agent2_out.publish(query) - if self.agent3_out: - self.agent3_out.publish(query) - - -class PoolAggregator(Module): - """Aggregate responses from pool.""" - - agent1_in: In[str] = None - agent2_in: In[str] = None - agent3_in: In[str] = None - response_out: Out[dict] = None - - @rpc - def start(self) -> None: - if self.agent1_in: - self.agent1_in.subscribe(lambda r: self._handle_response("agent1", r)) - if self.agent2_in: - self.agent2_in.subscribe(lambda r: self._handle_response("agent2", r)) - if self.agent3_in: - self.agent3_in.subscribe(lambda r: self._handle_response("agent3", r)) - - def _handle_response(self, agent_id: str, response: str) -> None: - if self.response_out: - self.response_out.publish({"agent_id": agent_id, "response": response}) - - -class PoolController(Module): - """Controller for pool testing.""" - - query_out: Out[dict] = None - - @rpc - def send_to_agent(self, agent_id: str, query: str) -> None: - self.query_out.publish({"agent_id": agent_id, "query": query}) - - @rpc - def broadcast(self, query: str) -> None: - self.query_out.publish({"agent_id": "all", "query": query}) - - -class PoolCollector(Module): - """Collect pool responses.""" - - response_in: In[dict] = None - - def __init__(self) -> None: - super().__init__() - self.responses = [] - - @rpc - def start(self) -> None: - self.response_in.subscribe(lambda r: self.responses.append(r)) - - @rpc - def get_responses(self) -> list: - return self.responses - - @rpc - def get_by_agent(self, agent_id: str) -> list: - return [r for r in self.responses if r.get("agent_id") == agent_id] - - -@pytest.mark.skip("Skipping pool tests for now") -@pytest.mark.module -@pytest.mark.asyncio -async def test_agent_pool() -> None: - """Test agent pool with multiple agents.""" - load_dotenv() - pubsub.lcm.autoconf() - - # Check for at least one API key - has_api_key = any( - [os.getenv("OPENAI_API_KEY"), os.getenv("ANTHROPIC_API_KEY"), os.getenv("CEREBRAS_API_KEY")] - ) - - if not has_api_key: - pytest.skip("No API keys found for testing") - - dimos = core.start(7) - - try: - # Deploy three agents with different configs - agents = [] - models = [] - - if os.getenv("CEREBRAS_API_KEY"): - agent1 = dimos.deploy( - BaseAgentModule, - model="cerebras::llama3.1-8b", - system_prompt="You are agent1. Be very brief.", - ) - agents.append(agent1) - models.append("agent1") - - if os.getenv("OPENAI_API_KEY"): - agent2 = dimos.deploy( - BaseAgentModule, - model="openai::gpt-4o-mini", - system_prompt="You are agent2. Be helpful.", - ) - agents.append(agent2) - models.append("agent2") - - if os.getenv("CEREBRAS_API_KEY") and len(agents) < 3: - agent3 = dimos.deploy( - BaseAgentModule, - model="cerebras::llama3.1-8b", - system_prompt="You are agent3. Be creative.", - ) - agents.append(agent3) - models.append("agent3") - - if len(agents) < 2: - pytest.skip("Need at least 2 working agents for pool test") - - # Deploy router, aggregator, controller, collector - router = dimos.deploy(PoolRouter) - aggregator = dimos.deploy(PoolAggregator) - controller = dimos.deploy(PoolController) - collector = dimos.deploy(PoolCollector) - - # Configure transports - controller.query_out.transport = core.pLCMTransport("/pool/queries") - aggregator.response_out.transport = core.pLCMTransport("/pool/responses") - - # Configure agent transports and connections - if len(agents) > 0: - router.agent1_out.transport = core.pLCMTransport("/pool/agent1/query") - agents[0].response_out.transport = core.pLCMTransport("/pool/agent1/response") - agents[0].query_in.connect(router.agent1_out) - aggregator.agent1_in.connect(agents[0].response_out) - - if len(agents) > 1: - router.agent2_out.transport = core.pLCMTransport("/pool/agent2/query") - agents[1].response_out.transport = core.pLCMTransport("/pool/agent2/response") - agents[1].query_in.connect(router.agent2_out) - aggregator.agent2_in.connect(agents[1].response_out) - - if len(agents) > 2: - router.agent3_out.transport = core.pLCMTransport("/pool/agent3/query") - agents[2].response_out.transport = core.pLCMTransport("/pool/agent3/response") - agents[2].query_in.connect(router.agent3_out) - aggregator.agent3_in.connect(agents[2].response_out) - - # Connect router and collector - router.query_in.connect(controller.query_out) - collector.response_in.connect(aggregator.response_out) - - # Start all modules - for agent in agents: - agent.start() - router.start() - aggregator.start() - collector.start() - - await asyncio.sleep(3) - - # Test direct routing - for _i, model_id in enumerate(models[:2]): # Test first 2 agents - controller.send_to_agent(model_id, f"Say hello from {model_id}") - await asyncio.sleep(0.5) - - await asyncio.sleep(6) - - responses = collector.get_responses() - print(f"Got {len(responses)} responses from direct routing") - assert len(responses) >= len(models[:2]), ( - f"Should get responses from at least {len(models[:2])} agents" - ) - - # Test broadcast - collector.responses.clear() - controller.broadcast("What is 1+1?") - - await asyncio.sleep(6) - - responses = collector.get_responses() - print(f"Got {len(responses)} responses from broadcast (expected {len(agents)})") - # Allow for some agents to be slow - assert len(responses) >= min(2, len(agents)), ( - f"Should get response from at least {min(2, len(agents))} agents" - ) - - # Check all agents responded - agent_ids = {r["agent_id"] for r in responses} - assert len(agent_ids) >= 2, "Multiple agents should respond" - - # Stop all agents - for agent in agents: - agent.stop() - - finally: - dimos.close() - dimos.shutdown() - - -@pytest.mark.skip("Skipping pool tests for now") -@pytest.mark.module -@pytest.mark.asyncio -async def test_mock_agent_pool() -> None: - """Test agent pool with mock agents.""" - pubsub.lcm.autoconf() - - class MockPoolAgent(Module): - """Mock agent for pool testing.""" - - query_in: In[str] = None - response_out: Out[str] = None - - def __init__(self, agent_id: str) -> None: - super().__init__() - self.agent_id = agent_id - - @rpc - def start(self) -> None: - self.query_in.subscribe(self._handle_query) - - def _handle_query(self, query: str) -> None: - if "1+1" in query: - self.response_out.publish(f"{self.agent_id}: The answer is 2") - else: - self.response_out.publish(f"{self.agent_id}: {query}") - - dimos = core.start(6) - - try: - # Deploy mock agents - agent1 = dimos.deploy(MockPoolAgent, agent_id="fast") - agent2 = dimos.deploy(MockPoolAgent, agent_id="smart") - agent3 = dimos.deploy(MockPoolAgent, agent_id="creative") - - # Deploy infrastructure - router = dimos.deploy(PoolRouter) - aggregator = dimos.deploy(PoolAggregator) - collector = dimos.deploy(PoolCollector) - - # Configure all transports - router.query_in.transport = core.pLCMTransport("/mock/pool/queries") - router.agent1_out.transport = core.pLCMTransport("/mock/pool/agent1/q") - router.agent2_out.transport = core.pLCMTransport("/mock/pool/agent2/q") - router.agent3_out.transport = core.pLCMTransport("/mock/pool/agent3/q") - - agent1.response_out.transport = core.pLCMTransport("/mock/pool/agent1/r") - agent2.response_out.transport = core.pLCMTransport("/mock/pool/agent2/r") - agent3.response_out.transport = core.pLCMTransport("/mock/pool/agent3/r") - - aggregator.response_out.transport = core.pLCMTransport("/mock/pool/responses") - - # Connect everything - agent1.query_in.connect(router.agent1_out) - agent2.query_in.connect(router.agent2_out) - agent3.query_in.connect(router.agent3_out) - - aggregator.agent1_in.connect(agent1.response_out) - aggregator.agent2_in.connect(agent2.response_out) - aggregator.agent3_in.connect(agent3.response_out) - - collector.response_in.connect(aggregator.response_out) - - # Start all - agent1.start() - agent2.start() - agent3.start() - router.start() - aggregator.start() - collector.start() - - await asyncio.sleep(0.5) - - # Test routing - router.query_in.transport.publish({"agent_id": "agent1", "query": "Hello"}) - router.query_in.transport.publish({"agent_id": "agent2", "query": "Hi"}) - - await asyncio.sleep(0.5) - - responses = collector.get_responses() - assert len(responses) == 2 - assert any("fast" in r["response"] for r in responses) - assert any("smart" in r["response"] for r in responses) - - # Test broadcast - collector.responses.clear() - router.query_in.transport.publish({"agent_id": "all", "query": "What is 1+1?"}) - - await asyncio.sleep(0.5) - - responses = collector.get_responses() - assert len(responses) == 3 - assert all("2" in r["response"] for r in responses) - - finally: - dimos.close() - dimos.shutdown() - - -if __name__ == "__main__": - asyncio.run(test_mock_agent_pool()) diff --git a/dimos/agents/test_agent_tools.py b/dimos/agents/test_agent_tools.py deleted file mode 100644 index fd485ac015..0000000000 --- a/dimos/agents/test_agent_tools.py +++ /dev/null @@ -1,409 +0,0 @@ -# Copyright 2025 Dimensional Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Production test for BaseAgent tool handling functionality.""" - -import asyncio -import os - -from dotenv import load_dotenv -from pydantic import Field -import pytest - -from dimos import core -from dimos.agents.agent_message import AgentMessage -from dimos.agents.agent_types import AgentResponse -from dimos.agents.modules.base import BaseAgent -from dimos.agents.modules.base_agent import BaseAgentModule -from dimos.core import In, Module, Out, rpc -from dimos.protocol import pubsub -from dimos.skills.skills import AbstractSkill, SkillLibrary -from dimos.utils.logging_config import setup_logger - -logger = setup_logger("test_agent_tools") - - -# Test Skills -class CalculateSkill(AbstractSkill): - """Perform a calculation.""" - - expression: str = Field(description="Mathematical expression to evaluate") - - def __call__(self) -> str: - try: - # Simple evaluation for testing - result = eval(self.expression) - return f"The result is {result}" - except Exception as e: - return f"Error calculating: {e!s}" - - -class WeatherSkill(AbstractSkill): - """Get current weather information for a location. This is a mock weather service that returns test data.""" - - location: str = Field(description="Location to get weather for (e.g. 'London', 'New York')") - - def __call__(self) -> str: - # Mock weather response - return f"The weather in {self.location} is sunny with a temperature of 72°F" - - -class NavigationSkill(AbstractSkill): - """Navigate to a location (potentially long-running).""" - - destination: str = Field(description="Destination to navigate to") - speed: float = Field(default=1.0, description="Navigation speed in m/s") - - def __call__(self) -> str: - # In real implementation, this would start navigation - # For now, simulate blocking behavior - import time - - time.sleep(0.5) # Simulate some processing - return f"Navigation to {self.destination} completed successfully" - - -# Module for testing tool execution -class ToolTestController(Module): - """Controller that sends queries to agent.""" - - message_out: Out[AgentMessage] = None - - @rpc - def send_query(self, query: str) -> None: - msg = AgentMessage() - msg.add_text(query) - self.message_out.publish(msg) - - -class ResponseCollector(Module): - """Collect agent responses.""" - - response_in: In[AgentResponse] = None - - def __init__(self) -> None: - super().__init__() - self.responses = [] - - @rpc - def start(self) -> None: - logger.info("ResponseCollector starting subscription") - self.response_in.subscribe(self._on_response) - logger.info("ResponseCollector subscription active") - - def _on_response(self, response) -> None: - logger.info(f"ResponseCollector received response #{len(self.responses) + 1}: {response}") - self.responses.append(response) - - @rpc - def get_responses(self): - return self.responses - - -@pytest.mark.tofix -@pytest.mark.module -@pytest.mark.asyncio -async def test_agent_module_with_tools() -> None: - """Test BaseAgentModule with tool execution.""" - load_dotenv() - - if not os.getenv("OPENAI_API_KEY"): - pytest.skip("No OPENAI_API_KEY found") - - pubsub.lcm.autoconf() - dimos = core.start(4) - - try: - # Create skill library - skill_library = SkillLibrary() - skill_library.add(CalculateSkill) - skill_library.add(WeatherSkill) - skill_library.add(NavigationSkill) - - # Deploy modules - controller = dimos.deploy(ToolTestController) - controller.message_out.transport = core.pLCMTransport("/tools/messages") - - agent = dimos.deploy( - BaseAgentModule, - model="openai::gpt-4o-mini", - system_prompt="You are a helpful assistant with access to calculation, weather, and navigation tools. When asked about weather, you MUST use the WeatherSkill tool - it provides mock weather data for testing. When asked to navigate somewhere, you MUST use the NavigationSkill tool. Always use the appropriate tool when available.", - skills=skill_library, - temperature=0.0, - memory=False, - ) - agent.response_out.transport = core.pLCMTransport("/tools/responses") - - collector = dimos.deploy(ResponseCollector) - - # Connect modules - agent.message_in.connect(controller.message_out) - collector.response_in.connect(agent.response_out) - - # Start modules - agent.start() - collector.start() - - # Wait for initialization - await asyncio.sleep(1) - - # Test 1: Calculation (fast tool) - logger.info("\n=== Test 1: Calculation Tool ===") - controller.send_query("Use the calculate tool to compute 42 * 17") - await asyncio.sleep(5) # Give more time for the response - - responses = collector.get_responses() - logger.info(f"Got {len(responses)} responses after first query") - assert len(responses) >= 1, ( - f"Should have received at least one response, got {len(responses)}" - ) - - response = responses[-1] - logger.info(f"Response: {response}") - - # Verify the calculation result - assert isinstance(response, AgentResponse), "Expected AgentResponse object" - assert "714" in response.content, f"Expected '714' in response, got: {response.content}" - - # Test 2: Weather query (fast tool) - logger.info("\n=== Test 2: Weather Tool ===") - controller.send_query("What's the weather in New York?") - await asyncio.sleep(5) # Give more time for the second response - - responses = collector.get_responses() - assert len(responses) >= 2, "Should have received at least two responses" - - response = responses[-1] - logger.info(f"Response: {response}") - - # Verify weather details - assert isinstance(response, AgentResponse), "Expected AgentResponse object" - assert "new york" in response.content.lower(), "Expected 'New York' in response" - assert "72" in response.content, "Expected temperature '72' in response" - assert "sunny" in response.content.lower(), "Expected 'sunny' in response" - - # Test 3: Navigation (potentially long-running) - logger.info("\n=== Test 3: Navigation Tool ===") - controller.send_query("Use the NavigationSkill to navigate to the kitchen") - await asyncio.sleep(6) # Give more time for navigation tool to complete - - responses = collector.get_responses() - logger.info(f"Total responses collected: {len(responses)}") - for i, r in enumerate(responses): - logger.info(f" Response {i + 1}: {r.content[:50]}...") - assert len(responses) >= 3, ( - f"Should have received at least three responses, got {len(responses)}" - ) - - response = responses[-1] - logger.info(f"Response: {response}") - - # Verify navigation response - assert isinstance(response, AgentResponse), "Expected AgentResponse object" - assert "kitchen" in response.content.lower(), "Expected 'kitchen' in response" - - # Check if NavigationSkill was called - if response.tool_calls is not None and len(response.tool_calls) > 0: - # Tool was called - verify it - assert any(tc.name == "NavigationSkill" for tc in response.tool_calls), ( - "Expected NavigationSkill to be called" - ) - logger.info("✓ NavigationSkill was called") - else: - # Tool wasn't called - just verify response mentions navigation - logger.info("Note: NavigationSkill was not called, agent gave instructions instead") - - # Stop agent - agent.stop() - - # Print summary - logger.info("\n=== Test Summary ===") - all_responses = collector.get_responses() - for i, resp in enumerate(all_responses): - logger.info( - f"Response {i + 1}: {resp.content if isinstance(resp, AgentResponse) else resp}" - ) - - finally: - dimos.close() - dimos.shutdown() - - -@pytest.mark.tofix -def test_base_agent_direct_tools() -> None: - """Test BaseAgent direct usage with tools.""" - load_dotenv() - - if not os.getenv("OPENAI_API_KEY"): - pytest.skip("No OPENAI_API_KEY found") - - # Create skill library - skill_library = SkillLibrary() - skill_library.add(CalculateSkill) - skill_library.add(WeatherSkill) - - # Create agent with skills - agent = BaseAgent( - model="openai::gpt-4o-mini", - system_prompt="You are a helpful assistant with access to a calculator tool. When asked to calculate something, you should use the CalculateSkill tool.", - skills=skill_library, - temperature=0.0, - memory=False, - seed=42, - ) - - # Test calculation with explicit tool request - logger.info("\n=== Direct Test 1: Calculation Tool ===") - response = agent.query("Calculate 144**0.5") - - logger.info(f"Response content: {response.content}") - logger.info(f"Tool calls: {response.tool_calls}") - - assert response.content is not None - assert "12" in response.content or "twelve" in response.content.lower(), ( - f"Expected '12' in response, got: {response.content}" - ) - - # Verify tool was called OR answer is correct - if response.tool_calls is not None: - assert len(response.tool_calls) > 0, "Expected at least one tool call" - assert response.tool_calls[0].name == "CalculateSkill", ( - f"Expected CalculateSkill, got: {response.tool_calls[0].name}" - ) - assert response.tool_calls[0].status == "completed", ( - f"Expected completed status, got: {response.tool_calls[0].status}" - ) - logger.info("✓ Tool was called successfully") - else: - logger.warning("Tool was not called - agent answered directly") - - # Test weather tool - logger.info("\n=== Direct Test 2: Weather Tool ===") - response2 = agent.query("Use the WeatherSkill to check the weather in London") - - logger.info(f"Response content: {response2.content}") - logger.info(f"Tool calls: {response2.tool_calls}") - - assert response2.content is not None - assert "london" in response2.content.lower(), "Expected 'London' in response" - assert "72" in response2.content, "Expected temperature '72' in response" - assert "sunny" in response2.content.lower(), "Expected 'sunny' in response" - - # Verify tool was called - if response2.tool_calls is not None: - assert len(response2.tool_calls) > 0, "Expected at least one tool call" - assert response2.tool_calls[0].name == "WeatherSkill", ( - f"Expected WeatherSkill, got: {response2.tool_calls[0].name}" - ) - logger.info("✓ Weather tool was called successfully") - else: - logger.warning("Weather tool was not called - agent answered directly") - - # Clean up - agent.dispose() - - -class MockToolAgent(BaseAgent): - """Mock agent for CI testing without API calls.""" - - def __init__(self, **kwargs) -> None: - # Skip gateway initialization - self.model = kwargs.get("model", "mock::test") - self.system_prompt = kwargs.get("system_prompt", "Mock agent") - self.skills = kwargs.get("skills", SkillLibrary()) - self.history = [] - self._history_lock = __import__("threading").Lock() - self._supports_vision = False - self.response_subject = None - self.gateway = None - self._executor = None - - async def _process_query_async(self, agent_msg, base64_image=None, base64_images=None): - """Mock tool execution.""" - from dimos.agents.agent_message import AgentMessage - from dimos.agents.agent_types import AgentResponse, ToolCall - - # Get text from AgentMessage - if isinstance(agent_msg, AgentMessage): - query = agent_msg.get_combined_text() - else: - query = str(agent_msg) - - # Simple pattern matching for tools - if "calculate" in query.lower(): - # Extract expression - import re - - match = re.search(r"(\d+\s*[\+\-\*/]\s*\d+)", query) - if match: - expr = match.group(1) - tool_call = ToolCall( - id="mock_calc_1", - name="CalculateSkill", - arguments={"expression": expr}, - status="completed", - ) - # Execute the tool - result = self.skills.call("CalculateSkill", expression=expr) - return AgentResponse( - content=f"I calculated {expr} and {result}", tool_calls=[tool_call] - ) - - # Default response - return AgentResponse(content=f"Mock response to: {query}") - - def dispose(self) -> None: - pass - - -@pytest.mark.tofix -def test_mock_agent_tools() -> None: - """Test mock agent with tools for CI.""" - # Create skill library - skill_library = SkillLibrary() - skill_library.add(CalculateSkill) - - # Create mock agent - agent = MockToolAgent(model="mock::test", skills=skill_library) - - # Test calculation - logger.info("\n=== Mock Test: Calculation ===") - response = agent.query("Calculate 25 + 17") - - logger.info(f"Mock response: {response.content}") - logger.info(f"Mock tool calls: {response.tool_calls}") - - assert response.content is not None - assert "42" in response.content, "Expected '42' in response" - assert response.tool_calls is not None, "Expected tool calls" - assert len(response.tool_calls) == 1, "Expected exactly one tool call" - assert response.tool_calls[0].name == "CalculateSkill", "Expected CalculateSkill" - assert response.tool_calls[0].status == "completed", "Expected completed status" - - # Clean up - agent.dispose() - - -if __name__ == "__main__": - # Run tests - test_mock_agent_tools() - print("✅ Mock agent tools test passed") - - test_base_agent_direct_tools() - print("✅ Direct agent tools test passed") - - asyncio.run(test_agent_module_with_tools()) - print("✅ Module agent tools test passed") - - print("\n✅ All production tool tests passed!") diff --git a/dimos/agents/test_agent_with_modules.py b/dimos/agents/test_agent_with_modules.py deleted file mode 100644 index 1a4ac70f65..0000000000 --- a/dimos/agents/test_agent_with_modules.py +++ /dev/null @@ -1,157 +0,0 @@ -# Copyright 2025 Dimensional Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Test agent module with proper module connections.""" - -import asyncio - -from dotenv import load_dotenv -import pytest - -from dimos import core -from dimos.agents.agent_message import AgentMessage -from dimos.agents.agent_types import AgentResponse -from dimos.agents.modules.base_agent import BaseAgentModule -from dimos.core import In, Module, Out, rpc -from dimos.protocol import pubsub - - -# Test query sender module -class QuerySender(Module): - """Module to send test queries.""" - - message_out: Out[AgentMessage] = None - - def __init__(self) -> None: - super().__init__() - - @rpc - def send_query(self, query: str) -> None: - """Send a query.""" - print(f"Sending query: {query}") - msg = AgentMessage() - msg.add_text(query) - self.message_out.publish(msg) - - -# Test response collector module -class ResponseCollector(Module): - """Module to collect responses.""" - - response_in: In[AgentResponse] = None - - def __init__(self) -> None: - super().__init__() - self.responses = [] - - @rpc - def start(self) -> None: - """Start collecting.""" - self.response_in.subscribe(self._on_response) - - def _on_response(self, msg: AgentResponse) -> None: - print(f"Received response: {msg.content if msg.content else msg}") - self.responses.append(msg) - - @rpc - def get_responses(self): - """Get collected responses.""" - return self.responses - - -@pytest.mark.tofix -@pytest.mark.module -@pytest.mark.asyncio -async def test_agent_module_connections() -> None: - """Test agent module with proper connections.""" - load_dotenv() - pubsub.lcm.autoconf() - - # Start Dask - dimos = core.start(4) - - try: - # Deploy modules - sender = dimos.deploy(QuerySender) - agent = dimos.deploy( - BaseAgentModule, - model="openai::gpt-4o-mini", - system_prompt="You are a helpful assistant. Answer in 10 words or less.", - ) - collector = dimos.deploy(ResponseCollector) - - # Configure transports - sender.message_out.transport = core.pLCMTransport("/messages") - agent.response_out.transport = core.pLCMTransport("/responses") - - # Connect modules - agent.message_in.connect(sender.message_out) - collector.response_in.connect(agent.response_out) - - # Start modules - agent.start() - collector.start() - - # Wait for initialization - await asyncio.sleep(1) - - # Test 1: Simple query - print("\n=== Test 1: Simple Query ===") - sender.send_query("What is 2+2?") - - await asyncio.sleep(5) # Increased wait time for API response - - responses = collector.get_responses() - assert len(responses) > 0, "Should have received a response" - assert isinstance(responses[0], AgentResponse), "Expected AgentResponse object" - assert "4" in responses[0].content or "four" in responses[0].content.lower(), ( - "Should calculate correctly" - ) - - # Test 2: Another query - print("\n=== Test 2: Another Query ===") - sender.send_query("What color is the sky?") - - await asyncio.sleep(5) # Increased wait time - - responses = collector.get_responses() - assert len(responses) >= 2, "Should have at least two responses" - assert isinstance(responses[1], AgentResponse), "Expected AgentResponse object" - assert "blue" in responses[1].content.lower(), "Should mention blue" - - # Test 3: Multiple queries - print("\n=== Test 3: Multiple Queries ===") - queries = ["Count from 1 to 3", "Name a fruit", "What is Python?"] - - for q in queries: - sender.send_query(q) - await asyncio.sleep(2) # Give more time between queries - - await asyncio.sleep(8) # More time for multiple queries - - responses = collector.get_responses() - assert len(responses) >= 4, f"Should have at least 4 responses, got {len(responses)}" - - # Stop modules - agent.stop() - - print("\n=== All tests passed! ===") - - finally: - dimos.close() - dimos.shutdown() - - -if __name__ == "__main__": - asyncio.run(test_agent_module_connections()) diff --git a/dimos/agents/test_base_agent_text.py b/dimos/agents/test_base_agent_text.py deleted file mode 100644 index 022bea9cd2..0000000000 --- a/dimos/agents/test_base_agent_text.py +++ /dev/null @@ -1,562 +0,0 @@ -# Copyright 2025 Dimensional Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Test BaseAgent text functionality.""" - -import asyncio -import os - -from dotenv import load_dotenv -import pytest - -from dimos import core -from dimos.agents.agent_message import AgentMessage -from dimos.agents.agent_types import AgentResponse -from dimos.agents.modules.base import BaseAgent -from dimos.agents.modules.base_agent import BaseAgentModule -from dimos.core import In, Module, Out, rpc -from dimos.protocol import pubsub - - -class QuerySender(Module): - """Module to send test queries.""" - - message_out: Out[AgentMessage] = None # New AgentMessage output - - @rpc - def send_query(self, query: str) -> None: - """Send a query as AgentMessage.""" - msg = AgentMessage() - msg.add_text(query) - self.message_out.publish(msg) - - @rpc - def send_message(self, message: AgentMessage) -> None: - """Send an AgentMessage.""" - self.message_out.publish(message) - - -class ResponseCollector(Module): - """Module to collect responses.""" - - response_in: In[AgentResponse] = None - - def __init__(self) -> None: - super().__init__() - self.responses = [] - - @rpc - def start(self) -> None: - """Start collecting.""" - self.response_in.subscribe(self._on_response) - - def _on_response(self, msg) -> None: - self.responses.append(msg) - - @rpc - def get_responses(self): - """Get collected responses.""" - return self.responses - - -@pytest.mark.tofix -def test_base_agent_direct_text() -> None: - """Test BaseAgent direct text usage.""" - load_dotenv() - - if not os.getenv("OPENAI_API_KEY"): - pytest.skip("No OPENAI_API_KEY found") - - # Create agent - agent = BaseAgent( - model="openai::gpt-4o-mini", - system_prompt="You are a helpful assistant. Answer in 10 words or less.", - temperature=0.0, - seed=42, # Fixed seed for deterministic results - ) - - # Test simple query with string (backward compatibility) - response = agent.query("What is 2+2?") - print(f"\n[Test] Query: 'What is 2+2?' -> Response: '{response.content}'") - assert response.content is not None - assert "4" in response.content or "four" in response.content.lower(), ( - f"Expected '4' or 'four' in response, got: {response.content}" - ) - - # Test with AgentMessage - msg = AgentMessage() - msg.add_text("What is 3+3?") - response = agent.query(msg) - print(f"[Test] Query: 'What is 3+3?' -> Response: '{response.content}'") - assert response.content is not None - assert "6" in response.content or "six" in response.content.lower(), ( - "Expected '6' or 'six' in response" - ) - - # Test conversation history - response = agent.query("What was my previous question?") - print(f"[Test] Query: 'What was my previous question?' -> Response: '{response.content}'") - assert response.content is not None - # The agent should reference one of the previous questions - # It might say "2+2" or "3+3" depending on interpretation of "previous" - assert ( - "2+2" in response.content or "3+3" in response.content or "What is" in response.content - ), f"Expected reference to a previous question, got: {response.content}" - - # Clean up - agent.dispose() - - -@pytest.mark.tofix -@pytest.mark.asyncio -async def test_base_agent_async_text() -> None: - """Test BaseAgent async text usage.""" - load_dotenv() - - if not os.getenv("OPENAI_API_KEY"): - pytest.skip("No OPENAI_API_KEY found") - - # Create agent - agent = BaseAgent( - model="openai::gpt-4o-mini", - system_prompt="You are a helpful assistant.", - temperature=0.0, - seed=42, - ) - - # Test async query with string - response = await agent.aquery("What is the capital of France?") - assert response.content is not None - assert "Paris" in response.content, "Expected 'Paris' in response" - - # Test async query with AgentMessage - msg = AgentMessage() - msg.add_text("What is the capital of Germany?") - response = await agent.aquery(msg) - assert response.content is not None - assert "Berlin" in response.content, "Expected 'Berlin' in response" - - # Clean up - agent.dispose() - - -@pytest.mark.tofix -@pytest.mark.module -@pytest.mark.asyncio -async def test_base_agent_module_text() -> None: - """Test BaseAgentModule with text via DimOS.""" - load_dotenv() - - if not os.getenv("OPENAI_API_KEY"): - pytest.skip("No OPENAI_API_KEY found") - - pubsub.lcm.autoconf() - dimos = core.start(4) - - try: - # Deploy modules - sender = dimos.deploy(QuerySender) - agent = dimos.deploy( - BaseAgentModule, - model="openai::gpt-4o-mini", - system_prompt="You are a helpful assistant. Answer concisely.", - ) - collector = dimos.deploy(ResponseCollector) - - # Configure transports - sender.message_out.transport = core.pLCMTransport("/test/messages") - agent.response_out.transport = core.pLCMTransport("/test/responses") - - # Connect modules - agent.message_in.connect(sender.message_out) - collector.response_in.connect(agent.response_out) - - # Start modules - agent.start() - collector.start() - - # Wait for initialization - await asyncio.sleep(1) - - # Test queries - sender.send_query("What is 2+2?") - await asyncio.sleep(3) - - responses = collector.get_responses() - assert len(responses) > 0, "Should have received a response" - resp = responses[0] - assert isinstance(resp, AgentResponse), "Expected AgentResponse object" - assert "4" in resp.content or "four" in resp.content.lower(), ( - f"Expected '4' or 'four' in response, got: {resp.content}" - ) - - # Test another query - sender.send_query("What color is the sky?") - await asyncio.sleep(3) - - responses = collector.get_responses() - assert len(responses) >= 2, "Should have at least two responses" - resp = responses[1] - assert isinstance(resp, AgentResponse), "Expected AgentResponse object" - assert "blue" in resp.content.lower(), "Expected 'blue' in response" - - # Test conversation history - sender.send_query("What was my first question?") - await asyncio.sleep(3) - - responses = collector.get_responses() - assert len(responses) >= 3, "Should have at least three responses" - resp = responses[2] - assert isinstance(resp, AgentResponse), "Expected AgentResponse object" - assert "2+2" in resp.content or "2" in resp.content, "Expected reference to first question" - - # Stop modules - agent.stop() - - finally: - dimos.close() - dimos.shutdown() - - -@pytest.mark.parametrize( - "model,provider", - [ - ("openai::gpt-4o-mini", "openai"), - ("anthropic::claude-3-haiku-20240307", "anthropic"), - ("cerebras::llama-3.3-70b", "cerebras"), - ], -) -@pytest.mark.tofix -def test_base_agent_providers(model, provider) -> None: - """Test BaseAgent with different providers.""" - load_dotenv() - - # Check for API key - api_key_map = { - "openai": "OPENAI_API_KEY", - "anthropic": "ANTHROPIC_API_KEY", - "cerebras": "CEREBRAS_API_KEY", - } - - if not os.getenv(api_key_map[provider]): - pytest.skip(f"No {api_key_map[provider]} found") - - # Create agent - agent = BaseAgent( - model=model, - system_prompt="You are a helpful assistant. Answer in 10 words or less.", - temperature=0.0, - seed=42, - ) - - # Test query with AgentMessage - msg = AgentMessage() - msg.add_text("What is the capital of France?") - response = agent.query(msg) - assert response.content is not None - assert "Paris" in response.content, f"Expected 'Paris' in response from {provider}" - - # Clean up - agent.dispose() - - -@pytest.mark.tofix -def test_base_agent_memory() -> None: - """Test BaseAgent with memory/RAG.""" - load_dotenv() - - if not os.getenv("OPENAI_API_KEY"): - pytest.skip("No OPENAI_API_KEY found") - - # Create agent - agent = BaseAgent( - model="openai::gpt-4o-mini", - system_prompt="You are a helpful assistant. Use the provided context when answering.", - temperature=0.0, - rag_threshold=0.3, - seed=42, - ) - - # Add context to memory - agent.memory.add_vector("doc1", "The DimOS framework is designed for building robotic systems.") - agent.memory.add_vector( - "doc2", "Robots using DimOS can perform navigation and manipulation tasks." - ) - - # Test RAG retrieval with AgentMessage - msg = AgentMessage() - msg.add_text("What is DimOS?") - response = agent.query(msg) - assert response.content is not None - assert "framework" in response.content.lower() or "robotic" in response.content.lower(), ( - "Expected context about DimOS in response" - ) - - # Clean up - agent.dispose() - - -class MockAgent(BaseAgent): - """Mock agent for testing without API calls.""" - - def __init__(self, **kwargs) -> None: - # Don't call super().__init__ to avoid gateway initialization - from dimos.agents.agent_types import ConversationHistory - - self.model = kwargs.get("model", "mock::test") - self.system_prompt = kwargs.get("system_prompt", "Mock agent") - self.conversation = ConversationHistory(max_size=20) - self._supports_vision = False - self.response_subject = None # Simplified - - async def _process_query_async(self, query: str, base64_image=None) -> str: - """Mock response.""" - if "2+2" in query: - return "The answer is 4" - elif "capital" in query and "France" in query: - return "The capital of France is Paris" - elif "color" in query and "sky" in query: - return "The sky is blue" - elif "previous" in query: - history = self.conversation.to_openai_format() - if len(history) >= 2: - # Get the second to last item (the last user query before this one) - for i in range(len(history) - 2, -1, -1): - if history[i]["role"] == "user": - return f"Your previous question was: {history[i]['content']}" - return "No previous questions" - else: - return f"Mock response to: {query}" - - def query(self, message) -> AgentResponse: - """Mock synchronous query.""" - # Convert to text if AgentMessage - if isinstance(message, AgentMessage): - text = message.get_combined_text() - else: - text = message - - # Update conversation history - self.conversation.add_user_message(text) - response = asyncio.run(self._process_query_async(text)) - self.conversation.add_assistant_message(response) - return AgentResponse(content=response) - - async def aquery(self, message) -> AgentResponse: - """Mock async query.""" - # Convert to text if AgentMessage - if isinstance(message, AgentMessage): - text = message.get_combined_text() - else: - text = message - - self.conversation.add_user_message(text) - response = await self._process_query_async(text) - self.conversation.add_assistant_message(response) - return AgentResponse(content=response) - - def dispose(self) -> None: - """Mock dispose.""" - pass - - -@pytest.mark.tofix -def test_mock_agent() -> None: - """Test mock agent for CI without API keys.""" - # Create mock agent - agent = MockAgent(model="mock::test", system_prompt="Mock assistant") - - # Test simple query - response = agent.query("What is 2+2?") - assert isinstance(response, AgentResponse), "Expected AgentResponse object" - assert "4" in response.content - - # Test conversation history - response = agent.query("What was my previous question?") - assert isinstance(response, AgentResponse), "Expected AgentResponse object" - assert "2+2" in response.content - - # Test other queries - response = agent.query("What is the capital of France?") - assert isinstance(response, AgentResponse), "Expected AgentResponse object" - assert "Paris" in response.content - - response = agent.query("What color is the sky?") - assert isinstance(response, AgentResponse), "Expected AgentResponse object" - assert "blue" in response.content.lower() - - # Clean up - agent.dispose() - - -@pytest.mark.tofix -def test_base_agent_conversation_history() -> None: - """Test that conversation history is properly maintained.""" - load_dotenv() - - if not os.getenv("OPENAI_API_KEY"): - pytest.skip("No OPENAI_API_KEY found") - - # Create agent - agent = BaseAgent( - model="openai::gpt-4o-mini", - system_prompt="You are a helpful assistant.", - temperature=0.0, - seed=42, - ) - - # Test 1: Simple conversation - response1 = agent.query("My name is Alice") - assert isinstance(response1, AgentResponse) - - # Check conversation history has both messages - assert agent.conversation.size() == 2 - history = agent.conversation.to_openai_format() - assert history[0]["role"] == "user" - assert history[0]["content"] == "My name is Alice" - assert history[1]["role"] == "assistant" - - # Test 2: Reference previous context - response2 = agent.query("What is my name?") - assert "Alice" in response2.content, "Agent should remember the name" - - # Conversation history should now have 4 messages - assert agent.conversation.size() == 4 - - # Test 3: Multiple text parts in AgentMessage - msg = AgentMessage() - msg.add_text("Calculate") - msg.add_text("the sum of") - msg.add_text("5 + 3") - - response3 = agent.query(msg) - assert "8" in response3.content or "eight" in response3.content.lower() - - # Check the combined text was stored correctly - assert agent.conversation.size() == 6 - history = agent.conversation.to_openai_format() - assert history[4]["role"] == "user" - assert history[4]["content"] == "Calculate the sum of 5 + 3" - - # Test 4: History trimming (set low limit) - agent.max_history = 4 - agent.query("What was my first message?") - - # Conversation history should be trimmed to 4 messages - assert agent.conversation.size() == 4 - # First messages should be gone - history = agent.conversation.to_openai_format() - assert "Alice" not in history[0]["content"] - - # Clean up - agent.dispose() - - -@pytest.mark.tofix -def test_base_agent_history_with_tools() -> None: - """Test conversation history with tool calls.""" - load_dotenv() - - if not os.getenv("OPENAI_API_KEY"): - pytest.skip("No OPENAI_API_KEY found") - - from pydantic import Field - - from dimos.skills.skills import AbstractSkill, SkillLibrary - - class CalculatorSkill(AbstractSkill): - """Perform calculations.""" - - expression: str = Field(description="Mathematical expression") - - def __call__(self) -> str: - try: - result = eval(self.expression) - return f"The result is {result}" - except: - return "Error in calculation" - - # Create agent with calculator skill - skills = SkillLibrary() - skills.add(CalculatorSkill) - - agent = BaseAgent( - model="openai::gpt-4o-mini", - system_prompt="You are a helpful assistant with a calculator. Use the calculator tool when asked to compute something.", - skills=skills, - temperature=0.0, - seed=42, - ) - - # Make a query that should trigger tool use - response = agent.query("Please calculate 42 * 17 using the calculator tool") - - # Check response - assert isinstance(response, AgentResponse) - assert "714" in response.content, f"Expected 714 in response, got: {response.content}" - - # Check tool calls were made - if response.tool_calls: - assert len(response.tool_calls) > 0 - assert response.tool_calls[0].name == "CalculatorSkill" - assert response.tool_calls[0].status == "completed" - - # Check history structure - # If tools were called, we should have more messages - if response.tool_calls and len(response.tool_calls) > 0: - assert agent.conversation.size() >= 3, ( - f"Expected at least 3 messages in history when tools are used, got {agent.conversation.size()}" - ) - - # Find the assistant message with tool calls - history = agent.conversation.to_openai_format() - tool_msg_found = False - tool_result_found = False - - for msg in history: - if msg.get("role") == "assistant" and msg.get("tool_calls"): - tool_msg_found = True - if msg.get("role") == "tool": - tool_result_found = True - assert "result" in msg.get("content", "").lower() - - assert tool_msg_found, "Tool call message should be in history when tools were used" - assert tool_result_found, "Tool result should be in history when tools were used" - else: - # No tools used, just verify we have user and assistant messages - assert agent.conversation.size() >= 2, ( - f"Expected at least 2 messages in history, got {agent.conversation.size()}" - ) - # The model solved it without using the tool - that's also acceptable - print("Note: Model solved without using the calculator tool") - - # Clean up - agent.dispose() - - -if __name__ == "__main__": - test_base_agent_direct_text() - asyncio.run(test_base_agent_async_text()) - asyncio.run(test_base_agent_module_text()) - test_base_agent_memory() - test_mock_agent() - test_base_agent_conversation_history() - test_base_agent_history_with_tools() - print("\n✅ All text tests passed!") - test_base_agent_direct_text() - asyncio.run(test_base_agent_async_text()) - asyncio.run(test_base_agent_module_text()) - test_base_agent_memory() - test_mock_agent() - print("\n✅ All text tests passed!") diff --git a/dimos/agents/test_conversation_history.py b/dimos/agents/test_conversation_history.py deleted file mode 100644 index 95b28fbc0b..0000000000 --- a/dimos/agents/test_conversation_history.py +++ /dev/null @@ -1,416 +0,0 @@ -#!/usr/bin/env python3 -# Copyright 2025 Dimensional Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Comprehensive conversation history tests for agents.""" - -import asyncio -import logging -import os - -from dotenv import load_dotenv -import numpy as np -from pydantic import Field -import pytest - -from dimos.agents.agent_message import AgentMessage -from dimos.agents.agent_types import AgentResponse -from dimos.agents.modules.base import BaseAgent -from dimos.msgs.sensor_msgs import Image -from dimos.skills.skills import AbstractSkill, SkillLibrary - -logger = logging.getLogger(__name__) - - -@pytest.mark.tofix -def test_conversation_history_basic() -> None: - """Test basic conversation history functionality.""" - load_dotenv() - - if not os.getenv("OPENAI_API_KEY"): - pytest.skip("No OPENAI_API_KEY found") - - agent = BaseAgent( - model="openai::gpt-4o-mini", - system_prompt="You are a helpful assistant with perfect memory.", - temperature=0.0, - seed=42, - ) - - try: - # Test 1: Simple text conversation - response1 = agent.query("My favorite color is blue") - assert isinstance(response1, AgentResponse) - assert agent.conversation.size() == 2 # user + assistant - - # Test 2: Reference previous information - response2 = agent.query("What is my favorite color?") - assert "blue" in response2.content.lower(), "Agent should remember the color" - assert agent.conversation.size() == 4 - - # Test 3: Multiple facts - agent.query("I live in San Francisco") - agent.query("I work as an engineer") - - # Verify history is building up - assert agent.conversation.size() == 8 # 4 exchanges (blue, what color, SF, engineer) - - response = agent.query("Tell me what you know about me") - - # Check if agent remembers at least some facts - # Note: Models may sometimes give generic responses, so we check for any memory - facts_mentioned = 0 - if "blue" in response.content.lower() or "color" in response.content.lower(): - facts_mentioned += 1 - if "san francisco" in response.content.lower() or "francisco" in response.content.lower(): - facts_mentioned += 1 - if "engineer" in response.content.lower(): - facts_mentioned += 1 - - # Agent should remember at least one fact, or acknowledge the conversation - assert facts_mentioned > 0 or "know" in response.content.lower(), ( - f"Agent should show some memory of conversation, got: {response.content}" - ) - - # Verify history properly accumulates - assert agent.conversation.size() == 10 - - finally: - agent.dispose() - - -@pytest.mark.tofix -def test_conversation_history_with_images() -> None: - """Test conversation history with multimodal content.""" - load_dotenv() - - if not os.getenv("OPENAI_API_KEY"): - pytest.skip("No OPENAI_API_KEY found") - - agent = BaseAgent( - model="openai::gpt-4o-mini", - system_prompt="You are a helpful vision assistant.", - temperature=0.0, - seed=42, - ) - - try: - # Send text message - agent.query("I'm going to show you some colors") - assert agent.conversation.size() == 2 - - # Send image with text - msg = AgentMessage() - msg.add_text("This is a red square") - red_img = Image(data=np.full((100, 100, 3), [255, 0, 0], dtype=np.uint8)) - msg.add_image(red_img) - - agent.query(msg) - assert agent.conversation.size() == 4 - - # Ask about the image - response3 = agent.query("What color did I just show you?") - # Check for any color mention (models sometimes see colors differently) - assert any( - color in response3.content.lower() - for color in ["red", "blue", "green", "color", "square"] - ), f"Should mention a color, got: {response3.content}" - - # Send another image - msg2 = AgentMessage() - msg2.add_text("Now here's a blue square") - blue_img = Image(data=np.full((100, 100, 3), [0, 0, 255], dtype=np.uint8)) - msg2.add_image(blue_img) - - agent.query(msg2) - assert agent.conversation.size() == 8 - - # Ask about all images - response5 = agent.query("What colors have I shown you?") - # Should mention seeing images/colors even if specific colors are wrong - assert any( - word in response5.content.lower() - for word in ["red", "blue", "colors", "squares", "images", "shown", "two"] - ), f"Should acknowledge seeing images, got: {response5.content}" - - # Verify both message types are in history - assert agent.conversation.size() == 10 - - finally: - agent.dispose() - - -@pytest.mark.tofix -def test_conversation_history_trimming() -> None: - """Test that conversation history is trimmed to max size.""" - load_dotenv() - - if not os.getenv("OPENAI_API_KEY"): - pytest.skip("No OPENAI_API_KEY found") - - # Create agent with small history limit - agent = BaseAgent( - model="openai::gpt-4o-mini", - system_prompt="You are a helpful assistant.", - temperature=0.0, - max_history=3, # Keep 3 message pairs (6 messages total) - seed=42, - ) - - try: - # Add several messages - agent.query("Message 1: I like apples") - assert agent.conversation.size() == 2 - - agent.query("Message 2: I like oranges") - # Now we have 2 pairs (4 messages) - # max_history=3 means we keep max 3 messages total (not pairs!) - size = agent.conversation.size() - # After trimming to 3, we'd have kept the most recent 3 messages - assert size == 3, f"After Message 2, size should be 3, got {size}" - - agent.query("Message 3: I like bananas") - size = agent.conversation.size() - assert size == 3, f"After Message 3, size should be 3, got {size}" - - # This should maintain trimming - agent.query("Message 4: I like grapes") - size = agent.conversation.size() - assert size == 3, f"After Message 4, size should still be 3, got {size}" - - # Add one more - agent.query("Message 5: I like strawberries") - size = agent.conversation.size() - assert size == 3, f"After Message 5, size should still be 3, got {size}" - - # Early messages should be trimmed - agent.query("What was the first fruit I mentioned?") - size = agent.conversation.size() - assert size == 3, f"After question, size should still be 3, got {size}" - - # Change max_history dynamically - agent.max_history = 2 - agent.query("New message after resize") - # Now history should be trimmed to 2 messages - size = agent.conversation.size() - assert size == 2, f"After resize to max_history=2, size should be 2, got {size}" - - finally: - agent.dispose() - - -@pytest.mark.tofix -def test_conversation_history_with_tools() -> None: - """Test conversation history with tool calls.""" - load_dotenv() - - if not os.getenv("OPENAI_API_KEY"): - pytest.skip("No OPENAI_API_KEY found") - - # Create a simple skill - class CalculatorSkillLocal(AbstractSkill): - """A simple calculator skill.""" - - expression: str = Field(description="Mathematical expression to evaluate") - - def __call__(self) -> str: - try: - result = eval(self.expression) - return f"The result is {result}" - except Exception as e: - return f"Error: {e}" - - # Create skill library properly - class TestSkillLibrary(SkillLibrary): - CalculatorSkill = CalculatorSkillLocal - - agent = BaseAgent( - model="openai::gpt-4o-mini", - system_prompt="You are a helpful assistant with access to a calculator.", - skills=TestSkillLibrary(), - temperature=0.0, - seed=100, - ) - - try: - # Initial query - agent.query("Hello, I need help with math") - assert agent.conversation.size() == 2 - - # Force tool use explicitly - response2 = agent.query( - "I need you to use the CalculatorSkill tool to compute 123 * 456. " - "Do NOT calculate it yourself - you MUST use the calculator tool function." - ) - - assert agent.conversation.size() == 6 # 2 + 1 + 3 - assert response2.tool_calls is not None and len(response2.tool_calls) > 0 - assert "56088" in response2.content.replace(",", "") - - # Ask about previous calculation - response3 = agent.query("What was the result of the calculation?") - assert "56088" in response3.content.replace(",", "") or "123" in response3.content.replace( - ",", "" - ) - assert agent.conversation.size() == 8 - - finally: - agent.dispose() - - -@pytest.mark.tofix -def test_conversation_thread_safety() -> None: - """Test that conversation history is thread-safe.""" - load_dotenv() - - if not os.getenv("OPENAI_API_KEY"): - pytest.skip("No OPENAI_API_KEY found") - - agent = BaseAgent(model="openai::gpt-4o-mini", temperature=0.0, seed=42) - - try: - - async def query_async(text: str): - """Async wrapper for query.""" - return await agent.aquery(text) - - async def run_concurrent(): - """Run multiple queries concurrently.""" - tasks = [query_async(f"Query {i}") for i in range(3)] - return await asyncio.gather(*tasks) - - # Run concurrent queries - results = asyncio.run(run_concurrent()) - assert len(results) == 3 - - # Should have roughly 6 messages (3 queries * 2) - # Exact count may vary due to thread timing - assert agent.conversation.size() >= 4 - assert agent.conversation.size() <= 6 - - finally: - agent.dispose() - - -@pytest.mark.tofix -def test_conversation_history_formats() -> None: - """Test ConversationHistory formatting methods.""" - load_dotenv() - - if not os.getenv("OPENAI_API_KEY"): - pytest.skip("No OPENAI_API_KEY found") - - agent = BaseAgent(model="openai::gpt-4o-mini", temperature=0.0, seed=42) - - try: - # Create a conversation - agent.conversation.add_user_message("Hello") - agent.conversation.add_assistant_message("Hi there!") - - # Test text with images - agent.conversation.add_user_message( - [ - {"type": "text", "text": "Look at this"}, - {"type": "image_url", "image_url": {"url": "data:image/jpeg;base64,abc123"}}, - ] - ) - agent.conversation.add_assistant_message("I see the image") - - # Test tool messages - agent.conversation.add_assistant_message( - content="", - tool_calls=[ - { - "id": "call_123", - "type": "function", - "function": {"name": "test", "arguments": "{}"}, - } - ], - ) - agent.conversation.add_tool_result( - tool_call_id="call_123", content="Tool result", name="test" - ) - - # Get OpenAI format - messages = agent.conversation.to_openai_format() - assert len(messages) == 6 - - # Verify message formats - assert messages[0]["role"] == "user" - assert messages[0]["content"] == "Hello" - - assert messages[2]["role"] == "user" - assert isinstance(messages[2]["content"], list) - - # Tool response message should be at index 5 (after assistant with tool_calls at index 4) - assert messages[5]["role"] == "tool" - assert messages[5]["tool_call_id"] == "call_123" - assert messages[5]["name"] == "test" - - finally: - agent.dispose() - - -@pytest.mark.tofix -@pytest.mark.timeout(30) # Add timeout to prevent hanging -def test_conversation_edge_cases() -> None: - """Test edge cases in conversation history.""" - load_dotenv() - - if not os.getenv("OPENAI_API_KEY"): - pytest.skip("No OPENAI_API_KEY found") - - agent = BaseAgent( - model="openai::gpt-4o-mini", - system_prompt="You are a helpful assistant.", - temperature=0.0, - seed=42, - ) - - try: - # Empty message - msg1 = AgentMessage() - msg1.add_text("") - response1 = agent.query(msg1) - assert response1.content is not None - - # Moderately long message (reduced from 1000 to 100 words) - long_text = "word " * 100 - response2 = agent.query(long_text) - assert response2.content is not None - - # Multiple text parts that combine - msg3 = AgentMessage() - for i in range(5): # Reduced from 10 to 5 - msg3.add_text(f"Part {i} ") - response3 = agent.query(msg3) - assert response3.content is not None - - # Verify history is maintained correctly - assert agent.conversation.size() == 6 # 3 exchanges - - finally: - agent.dispose() - - -if __name__ == "__main__": - # Run tests - test_conversation_history_basic() - test_conversation_history_with_images() - test_conversation_history_trimming() - test_conversation_history_with_tools() - test_conversation_thread_safety() - test_conversation_history_formats() - test_conversation_edge_cases() - print("\n✅ All conversation history tests passed!") diff --git a/dimos/agents/test_gateway.py b/dimos/agents/test_gateway.py deleted file mode 100644 index 2c54d5d1ac..0000000000 --- a/dimos/agents/test_gateway.py +++ /dev/null @@ -1,203 +0,0 @@ -# Copyright 2025 Dimensional Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Test gateway functionality.""" - -import asyncio -import os - -from dotenv import load_dotenv -import pytest - -from dimos.agents.modules.gateway import UnifiedGatewayClient - - -@pytest.mark.tofix -@pytest.mark.asyncio -async def test_gateway_basic() -> None: - """Test basic gateway functionality.""" - load_dotenv() - - # Check for at least one API key - has_api_key = any( - [os.getenv("OPENAI_API_KEY"), os.getenv("ANTHROPIC_API_KEY"), os.getenv("CEREBRAS_API_KEY")] - ) - - if not has_api_key: - pytest.skip("No API keys found for gateway test") - - gateway = UnifiedGatewayClient() - - try: - # Test with available provider - if os.getenv("OPENAI_API_KEY"): - model = "openai::gpt-4o-mini" - elif os.getenv("ANTHROPIC_API_KEY"): - model = "anthropic::claude-3-haiku-20240307" - else: - model = "cerebras::llama3.1-8b" - - messages = [ - {"role": "system", "content": "You are a helpful assistant."}, - {"role": "user", "content": "Say 'Hello Gateway' and nothing else."}, - ] - - # Test non-streaming - response = await gateway.ainference( - model=model, messages=messages, temperature=0.0, max_tokens=10 - ) - - assert "choices" in response - assert len(response["choices"]) > 0 - assert "message" in response["choices"][0] - assert "content" in response["choices"][0]["message"] - - content = response["choices"][0]["message"]["content"] - assert "hello" in content.lower() or "gateway" in content.lower() - - finally: - gateway.close() - - -@pytest.mark.tofix -@pytest.mark.asyncio -async def test_gateway_streaming() -> None: - """Test gateway streaming functionality.""" - load_dotenv() - - if not os.getenv("OPENAI_API_KEY"): - pytest.skip("OpenAI API key required for streaming test") - - gateway = UnifiedGatewayClient() - - try: - messages = [ - {"role": "system", "content": "You are a helpful assistant."}, - {"role": "user", "content": "Count from 1 to 3"}, - ] - - # Test streaming - chunks = [] - async for chunk in await gateway.ainference( - model="openai::gpt-4o-mini", messages=messages, temperature=0.0, stream=True - ): - chunks.append(chunk) - - assert len(chunks) > 0, "Should receive stream chunks" - - # Reconstruct content - content = "" - for chunk in chunks: - if chunk.get("choices"): - delta = chunk["choices"][0].get("delta", {}) - chunk_content = delta.get("content") - if chunk_content is not None: - content += chunk_content - - assert any(str(i) in content for i in [1, 2, 3]), "Should count numbers" - - finally: - gateway.close() - - -@pytest.mark.tofix -@pytest.mark.asyncio -async def test_gateway_tools() -> None: - """Test gateway can pass tool definitions to LLM and get responses.""" - load_dotenv() - - if not os.getenv("OPENAI_API_KEY"): - pytest.skip("OpenAI API key required for tools test") - - gateway = UnifiedGatewayClient() - - try: - # Just test that gateway accepts tools parameter and returns valid response - tools = [ - { - "type": "function", - "function": { - "name": "test_function", - "description": "A test function", - "parameters": { - "type": "object", - "properties": {"param": {"type": "string"}}, - }, - }, - } - ] - - messages = [ - {"role": "user", "content": "Hello, just testing the gateway"}, - ] - - # Just verify gateway doesn't crash when tools are provided - response = await gateway.ainference( - model="openai::gpt-4o-mini", messages=messages, tools=tools, temperature=0.0 - ) - - # Basic validation - gateway returned something - assert "choices" in response - assert len(response["choices"]) > 0 - assert "message" in response["choices"][0] - - finally: - gateway.close() - - -@pytest.mark.tofix -@pytest.mark.asyncio -async def test_gateway_providers() -> None: - """Test gateway with different providers.""" - load_dotenv() - - gateway = UnifiedGatewayClient() - - providers_tested = 0 - - try: - # Test each available provider - test_cases = [ - ("openai::gpt-4o-mini", "OPENAI_API_KEY"), - ("anthropic::claude-3-haiku-20240307", "ANTHROPIC_API_KEY"), - # ("cerebras::llama3.1-8b", "CEREBRAS_API_KEY"), - ("qwen::qwen-turbo", "DASHSCOPE_API_KEY"), - ] - - for model, env_var in test_cases: - if not os.getenv(env_var): - continue - - providers_tested += 1 - - messages = [{"role": "user", "content": "Reply with just the word 'OK'"}] - - response = await gateway.ainference( - model=model, messages=messages, temperature=0.0, max_tokens=10 - ) - - assert "choices" in response - content = response["choices"][0]["message"]["content"] - assert len(content) > 0, f"{model} should return content" - - if providers_tested == 0: - pytest.skip("No API keys found for provider test") - - finally: - gateway.close() - - -if __name__ == "__main__": - load_dotenv() - asyncio.run(test_gateway_basic()) diff --git a/dimos/agents/test_simple_agent_module.py b/dimos/agents/test_simple_agent_module.py deleted file mode 100644 index bd374877dd..0000000000 --- a/dimos/agents/test_simple_agent_module.py +++ /dev/null @@ -1,222 +0,0 @@ -# Copyright 2025 Dimensional Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Test simple agent module with string input/output.""" - -import asyncio -import os - -from dotenv import load_dotenv -import pytest - -from dimos import core -from dimos.agents.agent_message import AgentMessage -from dimos.agents.agent_types import AgentResponse -from dimos.agents.modules.base_agent import BaseAgentModule -from dimos.core import In, Module, Out, rpc -from dimos.protocol import pubsub - - -class QuerySender(Module): - """Module to send test queries.""" - - message_out: Out[AgentMessage] = None - - @rpc - def send_query(self, query: str) -> None: - """Send a query.""" - msg = AgentMessage() - msg.add_text(query) - self.message_out.publish(msg) - - -class ResponseCollector(Module): - """Module to collect responses.""" - - response_in: In[AgentResponse] = None - - def __init__(self) -> None: - super().__init__() - self.responses = [] - - @rpc - def start(self) -> None: - """Start collecting.""" - self.response_in.subscribe(self._on_response) - - def _on_response(self, response: AgentResponse) -> None: - """Handle response.""" - self.responses.append(response) - - @rpc - def get_responses(self) -> list: - """Get collected responses.""" - return self.responses - - @rpc - def clear(self) -> None: - """Clear responses.""" - self.responses = [] - - -@pytest.mark.tofix -@pytest.mark.module -@pytest.mark.asyncio -@pytest.mark.parametrize( - "model,provider", - [ - ("openai::gpt-4o-mini", "OpenAI"), - ("anthropic::claude-3-haiku-20240307", "Claude"), - ("cerebras::llama3.1-8b", "Cerebras"), - ("qwen::qwen-turbo", "Qwen"), - ], -) -async def test_simple_agent_module(model, provider) -> None: - """Test simple agent module with different providers.""" - load_dotenv() - - # Skip if no API key - if provider == "OpenAI" and not os.getenv("OPENAI_API_KEY"): - pytest.skip("No OpenAI API key found") - elif provider == "Claude" and not os.getenv("ANTHROPIC_API_KEY"): - pytest.skip("No Anthropic API key found") - elif provider == "Cerebras" and not os.getenv("CEREBRAS_API_KEY"): - pytest.skip("No Cerebras API key found") - elif provider == "Qwen" and not os.getenv("ALIBABA_API_KEY"): - pytest.skip("No Qwen API key found") - - pubsub.lcm.autoconf() - - # Start Dask cluster - dimos = core.start(3) - - try: - # Deploy modules - sender = dimos.deploy(QuerySender) - agent = dimos.deploy( - BaseAgentModule, - model=model, - system_prompt=f"You are a helpful {provider} assistant. Keep responses brief.", - ) - collector = dimos.deploy(ResponseCollector) - - # Configure transports - sender.message_out.transport = core.pLCMTransport(f"/test/{provider}/messages") - agent.response_out.transport = core.pLCMTransport(f"/test/{provider}/responses") - - # Connect modules - agent.message_in.connect(sender.message_out) - collector.response_in.connect(agent.response_out) - - # Start modules - agent.start() - collector.start() - - await asyncio.sleep(1) - - # Test simple math - sender.send_query("What is 2+2?") - await asyncio.sleep(5) - - responses = collector.get_responses() - assert len(responses) > 0, f"{provider} should respond" - assert isinstance(responses[0], AgentResponse), "Expected AgentResponse object" - assert "4" in responses[0].content, f"{provider} should calculate correctly" - - # Test brief response - collector.clear() - sender.send_query("Name one color.") - await asyncio.sleep(5) - - responses = collector.get_responses() - assert len(responses) > 0, f"{provider} should respond" - assert isinstance(responses[0], AgentResponse), "Expected AgentResponse object" - assert len(responses[0].content) < 200, f"{provider} should give brief response" - - # Stop modules - agent.stop() - - finally: - dimos.close() - dimos.shutdown() - - -@pytest.mark.tofix -@pytest.mark.module -@pytest.mark.asyncio -async def test_mock_agent_module() -> None: - """Test agent module with mock responses (no API needed).""" - pubsub.lcm.autoconf() - - class MockAgentModule(Module): - """Mock agent for testing.""" - - message_in: In[AgentMessage] = None - response_out: Out[AgentResponse] = None - - @rpc - def start(self) -> None: - self.message_in.subscribe(self._handle_message) - - def _handle_message(self, msg: AgentMessage) -> None: - query = msg.get_combined_text() - if "2+2" in query: - self.response_out.publish(AgentResponse(content="4")) - elif "color" in query.lower(): - self.response_out.publish(AgentResponse(content="Blue")) - else: - self.response_out.publish(AgentResponse(content=f"Mock response to: {query}")) - - dimos = core.start(2) - - try: - # Deploy - agent = dimos.deploy(MockAgentModule) - collector = dimos.deploy(ResponseCollector) - - # Configure - agent.message_in.transport = core.pLCMTransport("/mock/messages") - agent.response_out.transport = core.pLCMTransport("/mock/response") - - # Connect - collector.response_in.connect(agent.response_out) - - # Start - agent.start() - collector.start() - - await asyncio.sleep(1) - - # Test - use a simple query sender - sender = dimos.deploy(QuerySender) - sender.message_out.transport = core.pLCMTransport("/mock/messages") - agent.message_in.connect(sender.message_out) - - await asyncio.sleep(1) - - sender.send_query("What is 2+2?") - await asyncio.sleep(1) - - responses = collector.get_responses() - assert len(responses) == 1 - assert isinstance(responses[0], AgentResponse), "Expected AgentResponse object" - assert responses[0].content == "4" - - finally: - dimos.close() - dimos.shutdown() - - -if __name__ == "__main__": - asyncio.run(test_mock_agent_module()) diff --git a/dimos/agents2/temp/run_unitree_agents2.py b/dimos/agents2/temp/run_unitree_agents2.py deleted file mode 100644 index aacfd1b5f4..0000000000 --- a/dimos/agents2/temp/run_unitree_agents2.py +++ /dev/null @@ -1,187 +0,0 @@ -#!/usr/bin/env python3 -# Copyright 2025 Dimensional Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" -Run script for Unitree Go2 robot with agents2 framework. -This is the migrated version using the new LangChain-based agent system. -""" - -import os -from pathlib import Path -import sys -import time - -from dotenv import load_dotenv - -from dimos.agents2.cli.human import HumanInput - -# Add parent directories to path -sys.path.insert(0, str(Path(__file__).parent.parent.parent.parent)) - - -from dimos.agents2 import Agent -from dimos.agents2.spec import Model, Provider -from dimos.robot.unitree_webrtc.unitree_go2 import UnitreeGo2 -from dimos.robot.unitree_webrtc.unitree_skill_container import UnitreeSkillContainer -from dimos.utils.logging_config import setup_logger - -logger = setup_logger("dimos.agents2.run_unitree") - -# Load environment variables -load_dotenv() - -# System prompt path -SYSTEM_PROMPT_PATH = os.path.join( - os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(__file__)))), - "assets/agent/prompt.txt", -) - - -class UnitreeAgentRunner: - """Manages the Unitree robot with the new agents2 framework.""" - - def __init__(self) -> None: - self.robot = None - self.agent = None - self.agent_thread = None - self.running = False - - def setup_robot(self) -> UnitreeGo2: - """Initialize the robot connection.""" - logger.info("Initializing Unitree Go2 robot...") - - robot = UnitreeGo2( - ip=os.getenv("ROBOT_IP"), - connection_type=os.getenv("CONNECTION_TYPE", "webrtc"), - ) - - robot.start() - time.sleep(3) - - logger.info("Robot initialized successfully") - return robot - - def setup_agent(self, skillcontainers, system_prompt: str) -> Agent: - """Create and configure the agent with skills.""" - logger.info("Setting up agent with skills...") - - # Create agent - agent = Agent( - system_prompt=system_prompt, - model=Model.GPT_4O, # Could add CLAUDE models to enum - provider=Provider.OPENAI, # Would need ANTHROPIC provider - ) - - for container in skillcontainers: - print("REGISTERING SKILLS FROM CONTAINER:", container) - agent.register_skills(container) - - agent.run_implicit_skill("human") - - agent.start() - - # Log available skills - names = ", ".join([tool.name for tool in agent.get_tools()]) - logger.info(f"Agent configured with {len(names)} skills: {names}") - - agent.loop_thread() - return agent - - def run(self) -> None: - """Main run loop.""" - print("\n" + "=" * 60) - print("Unitree Go2 Robot with agents2 Framework") - print("=" * 60) - print("\nThis system integrates:") - print(" - Unitree Go2 quadruped robot") - print(" - WebRTC communication interface") - print(" - LangChain-based agent system (agents2)") - print(" - Converted skill system with @skill decorators") - print("\nStarting system...\n") - - # Check for API key (would need ANTHROPIC_API_KEY for Claude) - if not os.getenv("OPENAI_API_KEY"): - print("WARNING: OPENAI_API_KEY not found in environment") - print("Please set your API key in .env file or environment") - print("(Note: Full Claude support would require ANTHROPIC_API_KEY)") - sys.exit(1) - - system_prompt = """You are a helpful robot assistant controlling a Unitree Go2 quadruped robot. -You can move, navigate, speak, and perform various actions. Be helpful and friendly.""" - - try: - # Setup components - self.robot = self.setup_robot() - - self.agent = self.setup_agent( - [ - UnitreeSkillContainer(self.robot), - HumanInput(), - ], - system_prompt, - ) - - # Start handling queries - self.running = True - - logger.info("=" * 60) - logger.info("Unitree Go2 Agent Ready (agents2 framework)!") - logger.info("You can:") - logger.info(" - Type commands in the human cli") - logger.info(" - Ask the robot to move or navigate") - logger.info(" - Ask the robot to perform actions (sit, stand, dance, etc.)") - logger.info(" - Ask the robot to speak text") - logger.info("=" * 60) - - while True: - time.sleep(1) - except KeyboardInterrupt: - logger.info("Keyboard interrupt received") - except Exception as e: - logger.error(f"Error running robot: {e}") - import traceback - - traceback.print_exc() - # finally: - # self.shutdown() - - def shutdown(self) -> None: - logger.info("Shutting down...") - self.running = False - - if self.agent: - try: - self.agent.stop() - logger.info("Agent stopped") - except Exception as e: - logger.error(f"Error stopping agent: {e}") - - if self.robot: - try: - self.robot.stop() - logger.info("Robot connection closed") - except Exception as e: - logger.error(f"Error stopping robot: {e}") - - logger.info("Shutdown complete") - - -def main() -> None: - runner = UnitreeAgentRunner() - runner.run() - - -if __name__ == "__main__": - main() diff --git a/dimos/agents2/temp/run_unitree_async.py b/dimos/agents2/temp/run_unitree_async.py deleted file mode 100644 index 29213c1c90..0000000000 --- a/dimos/agents2/temp/run_unitree_async.py +++ /dev/null @@ -1,181 +0,0 @@ -#!/usr/bin/env python3 -# Copyright 2025 Dimensional Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" -Async version of the Unitree run file for agents2. -Properly handles the async nature of the agent. -""" - -import asyncio -import os -from pathlib import Path -import sys - -from dotenv import load_dotenv - -# Add parent directories to path -sys.path.insert(0, str(Path(__file__).parent.parent.parent.parent)) - -from dimos.agents2 import Agent -from dimos.agents2.spec import Model, Provider -from dimos.robot.unitree_webrtc.unitree_go2 import UnitreeGo2 -from dimos.robot.unitree_webrtc.unitree_skill_container import UnitreeSkillContainer -from dimos.utils.logging_config import setup_logger - -logger = setup_logger("run_unitree_async") - -# Load environment variables -load_dotenv() - -# System prompt path -SYSTEM_PROMPT_PATH = os.path.join( - os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(__file__)))), - "assets/agent/prompt.txt", -) - - -async def handle_query(agent, query_text): - """Handle a single query asynchronously.""" - logger.info(f"Processing query: {query_text}") - - try: - # Use query_async which returns a Future - future = agent.query_async(query_text) - - # Wait for the result (with timeout) - await asyncio.wait_for(asyncio.wrap_future(future), timeout=30.0) - - # Get the result - if future.done(): - result = future.result() - logger.info(f"Agent response: {result}") - return result - else: - logger.warning("Query did not complete") - return "Query timeout" - - except asyncio.TimeoutError: - logger.error("Query timed out after 30 seconds") - return "Query timeout" - except Exception as e: - logger.error(f"Error processing query: {e}") - return f"Error: {e!s}" - - -async def interactive_loop(agent) -> None: - """Run an interactive query loop.""" - print("\n" + "=" * 60) - print("Interactive Agent Mode") - print("Type your commands or 'quit' to exit") - print("=" * 60 + "\n") - - while True: - try: - # Get user input - query = input("\nYou: ").strip() - - if query.lower() in ["quit", "exit", "q"]: - break - - if not query: - continue - - # Process query - response = await handle_query(agent, query) - print(f"\nAgent: {response}") - - except KeyboardInterrupt: - break - except Exception as e: - logger.error(f"Error in interactive loop: {e}") - - -async def main() -> None: - """Main async function.""" - print("\n" + "=" * 60) - print("Unitree Go2 Robot with agents2 Framework (Async)") - print("=" * 60) - - # Check for API key - if not os.getenv("OPENAI_API_KEY"): - print("ERROR: OPENAI_API_KEY not found") - print("Set your API key in .env file or environment") - sys.exit(1) - - # Load system prompt - try: - with open(SYSTEM_PROMPT_PATH) as f: - system_prompt = f.read() - except FileNotFoundError: - system_prompt = """You are a helpful robot assistant controlling a Unitree Go2 robot. -You have access to various movement and control skills. Be helpful and concise.""" - - # Initialize robot (optional - comment out if no robot) - robot = None - if os.getenv("ROBOT_IP"): - try: - logger.info("Connecting to robot...") - robot = UnitreeGo2( - ip=os.getenv("ROBOT_IP"), - connection_type=os.getenv("CONNECTION_TYPE", "webrtc"), - ) - robot.start() - await asyncio.sleep(3) - logger.info("Robot connected") - except Exception as e: - logger.warning(f"Could not connect to robot: {e}") - logger.info("Continuing without robot...") - - # Create skill container - skill_container = UnitreeSkillContainer(robot=robot) - - # Create agent - agent = Agent( - system_prompt=system_prompt, - model=Model.GPT_4O_MINI, # Using mini for faster responses - provider=Provider.OPENAI, - ) - - # Register skills and start - agent.register_skills(skill_container) - agent.start() - - # Log available skills - skills = skill_container.skills() - logger.info(f"Agent initialized with {len(skills)} skills") - - # Test query - print("\n--- Testing agent query ---") - test_response = await handle_query(agent, "Hello! Can you list 5 of your movement skills?") - print(f"Test response: {test_response}\n") - - # Run interactive loop - try: - await interactive_loop(agent) - except KeyboardInterrupt: - logger.info("Interrupted by user") - - # Clean up - logger.info("Shutting down...") - agent.stop() - if robot: - logger.info("Robot disconnected") - - print("\nGoodbye!") - - -if __name__ == "__main__": - # Run the async main function - asyncio.run(main()) diff --git a/dimos/environment/agent_environment.py b/dimos/environment/agent_environment.py deleted file mode 100644 index a5dab0e272..0000000000 --- a/dimos/environment/agent_environment.py +++ /dev/null @@ -1,145 +0,0 @@ -# Copyright 2025 Dimensional Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from pathlib import Path - -import cv2 -import numpy as np - -from .environment import Environment - - -class AgentEnvironment(Environment): - def __init__(self) -> None: - super().__init__() - self.environment_type = "agent" - self.frames = [] - self.current_frame_idx = 0 - self._depth_maps = [] - self._segmentations = [] - self._point_clouds = [] - - def initialize_from_images(self, images: list[str] | list[np.ndarray]) -> bool: - """Initialize environment from a list of image paths or numpy arrays. - - Args: - images: List of image paths or numpy arrays representing frames - - Returns: - bool: True if initialization successful, False otherwise - """ - try: - self.frames = [] - for img in images: - if isinstance(img, str): - frame = cv2.imread(img) - if frame is None: - raise ValueError(f"Failed to load image: {img}") - self.frames.append(frame) - elif isinstance(img, np.ndarray): - self.frames.append(img.copy()) - else: - raise ValueError(f"Unsupported image type: {type(img)}") - return True - except Exception as e: - print(f"Failed to initialize from images: {e}") - return False - - def initialize_from_file(self, file_path: str) -> bool: - """Initialize environment from a video file. - - Args: - file_path: Path to the video file - - Returns: - bool: True if initialization successful, False otherwise - """ - try: - if not Path(file_path).exists(): - raise FileNotFoundError(f"Video file not found: {file_path}") - - cap = cv2.VideoCapture(file_path) - self.frames = [] - - while cap.isOpened(): - ret, frame = cap.read() - if not ret: - break - self.frames.append(frame) - - cap.release() - return len(self.frames) > 0 - except Exception as e: - print(f"Failed to initialize from video: {e}") - return False - - def initialize_from_directory(self, directory_path: str) -> bool: - """Initialize environment from a directory of images.""" - # TODO: Implement directory initialization - raise NotImplementedError("Directory initialization not yet implemented") - - def label_objects(self) -> list[str]: - """Implementation of abstract method to label objects.""" - # TODO: Implement object labeling using a detection model - raise NotImplementedError("Object labeling not yet implemented") - - def generate_segmentations( - self, model: str | None = None, objects: list[str] | None = None, *args, **kwargs - ) -> list[np.ndarray]: - """Generate segmentations for the current frame.""" - # TODO: Implement segmentation generation using specified model - raise NotImplementedError("Segmentation generation not yet implemented") - - def get_segmentations(self) -> list[np.ndarray]: - """Return pre-computed segmentations for the current frame.""" - if self._segmentations: - return self._segmentations[self.current_frame_idx] - return [] - - def generate_point_cloud(self, object: str | None = None, *args, **kwargs) -> np.ndarray: - """Generate point cloud from the current frame.""" - # TODO: Implement point cloud generation - raise NotImplementedError("Point cloud generation not yet implemented") - - def get_point_cloud(self, object: str | None = None) -> np.ndarray: - """Return pre-computed point cloud.""" - if self._point_clouds: - return self._point_clouds[self.current_frame_idx] - return np.array([]) - - def generate_depth_map( - self, - stereo: bool | None = None, - monocular: bool | None = None, - model: str | None = None, - *args, - **kwargs, - ) -> np.ndarray: - """Generate depth map for the current frame.""" - # TODO: Implement depth map generation using specified method - raise NotImplementedError("Depth map generation not yet implemented") - - def get_depth_map(self) -> np.ndarray: - """Return pre-computed depth map for the current frame.""" - if self._depth_maps: - return self._depth_maps[self.current_frame_idx] - return np.array([]) - - def get_frame_count(self) -> int: - """Return the total number of frames.""" - return len(self.frames) - - def get_current_frame_index(self) -> int: - """Return the current frame index.""" - return self.current_frame_idx diff --git a/dimos/environment/colmap_environment.py b/dimos/environment/colmap_environment.py deleted file mode 100644 index f1b0986c77..0000000000 --- a/dimos/environment/colmap_environment.py +++ /dev/null @@ -1,91 +0,0 @@ -# Copyright 2025 Dimensional Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# UNDER DEVELOPMENT 🚧🚧🚧 - -from pathlib import Path - -import cv2 -import pycolmap - -from dimos.environment.environment import Environment - - -class COLMAPEnvironment(Environment): - def initialize_from_images(self, image_dir): - """Initialize the environment from a set of image frames or video.""" - image_dir = Path(image_dir) - output_path = Path("colmap_output") - output_path.mkdir(exist_ok=True) - mvs_path = output_path / "mvs" - database_path = output_path / "database.db" - - # Step 1: Feature extraction - pycolmap.extract_features(database_path, image_dir) - - # Step 2: Feature matching - pycolmap.match_exhaustive(database_path) - - # Step 3: Sparse reconstruction - maps = pycolmap.incremental_mapping(database_path, image_dir, output_path) - maps[0].write(output_path) - - # Step 4: Dense reconstruction (optional) - pycolmap.undistort_images(mvs_path, output_path, image_dir) - pycolmap.patch_match_stereo(mvs_path) # Requires compilation with CUDA - pycolmap.stereo_fusion(mvs_path / "dense.ply", mvs_path) - - return maps - - def initialize_from_video(self, video_path, frame_output_dir): - """Extract frames from a video and initialize the environment.""" - video_path = Path(video_path) - frame_output_dir = Path(frame_output_dir) - frame_output_dir.mkdir(exist_ok=True) - - # Extract frames from the video - self._extract_frames_from_video(video_path, frame_output_dir) - - # Initialize from the extracted frames - return self.initialize_from_images(frame_output_dir) - - def _extract_frames_from_video(self, video_path, frame_output_dir) -> None: - """Extract frames from a video and save them to a directory.""" - cap = cv2.VideoCapture(str(video_path)) - frame_count = 0 - - while cap.isOpened(): - ret, frame = cap.read() - if not ret: - break - frame_filename = frame_output_dir / f"frame_{frame_count:04d}.jpg" - cv2.imwrite(str(frame_filename), frame) - frame_count += 1 - - cap.release() - - def label_objects(self) -> None: - pass - - def get_visualization(self, format_type) -> None: - pass - - def get_segmentations(self) -> None: - pass - - def get_point_cloud(self, object_id=None) -> None: - pass - - def get_depth_map(self) -> None: - pass diff --git a/dimos/hardware/ufactory.py b/dimos/hardware/ufactory.py deleted file mode 100644 index 57caf2e3bd..0000000000 --- a/dimos/hardware/ufactory.py +++ /dev/null @@ -1,32 +0,0 @@ -# Copyright 2025 Dimensional Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from dimos.hardware.end_effector import EndEffector - - -class UFactoryEndEffector(EndEffector): - def __init__(self, model=None, **kwargs) -> None: - super().__init__(**kwargs) - self.model = model - - def get_model(self): - return self.model - - -class UFactory7DOFArm: - def __init__(self, arm_length=None) -> None: - self.arm_length = arm_length - - def get_arm_length(self): - return self.arm_length diff --git a/dimos/models/labels/llava-34b.py b/dimos/models/labels/llava-34b.py deleted file mode 100644 index 52e28ac24e..0000000000 --- a/dimos/models/labels/llava-34b.py +++ /dev/null @@ -1,91 +0,0 @@ -# Copyright 2025 Dimensional Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import json -import os - -# llava v1.6 -from llama_cpp import Llama -from llama_cpp.llama_chat_format import Llava15ChatHandler -from vqasynth.datasets.utils import image_to_base64_data_uri - - -class Llava: - def __init__( - self, - mmproj: str=f"{os.getcwd()}/models/mmproj-model-f16.gguf", - model_path: str=f"{os.getcwd()}/models/llava-v1.6-34b.Q4_K_M.gguf", - gpu: bool=True, - ) -> None: - chat_handler = Llava15ChatHandler(clip_model_path=mmproj, verbose=True) - n_gpu_layers = 0 - if gpu: - n_gpu_layers = -1 - self.llm = Llama( - model_path=model_path, - chat_handler=chat_handler, - n_ctx=2048, - logits_all=True, - n_gpu_layers=n_gpu_layers, - ) - - def run_inference(self, image, prompt: str, return_json: bool=True): - data_uri = image_to_base64_data_uri(image) - res = self.llm.create_chat_completion( - messages=[ - { - "role": "system", - "content": "You are an assistant who perfectly describes images.", - }, - { - "role": "user", - "content": [ - {"type": "image_url", "image_url": {"url": data_uri}}, - {"type": "text", "text": prompt}, - ], - }, - ] - ) - if return_json: - return list( - set( - self.extract_descriptions_from_incomplete_json( - res["choices"][0]["message"]["content"] - ) - ) - ) - - return res["choices"][0]["message"]["content"] - - def extract_descriptions_from_incomplete_json(self, json_like_str): - last_object_idx = json_like_str.rfind(',"object') - - if last_object_idx != -1: - json_str = json_like_str[:last_object_idx] + "}" - else: - json_str = json_like_str.strip() - if not json_str.endswith("}"): - json_str += "}" - - try: - json_obj = json.loads(json_str) - descriptions = [ - details["description"].replace(".", "") - for key, details in json_obj.items() - if "description" in details - ] - - return descriptions - except json.JSONDecodeError as e: - raise ValueError(f"Error parsing JSON: {e}") diff --git a/dimos/models/segmentation/clipseg.py b/dimos/models/segmentation/clipseg.py deleted file mode 100644 index ca8fbeb6fc..0000000000 --- a/dimos/models/segmentation/clipseg.py +++ /dev/null @@ -1,32 +0,0 @@ -# Copyright 2025 Dimensional Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from transformers import AutoProcessor, CLIPSegForImageSegmentation - - -class CLIPSeg: - def __init__(self, model_name: str="CIDAS/clipseg-rd64-refined") -> None: - self.clipseg_processor = AutoProcessor.from_pretrained(model_name) - self.clipseg_model = CLIPSegForImageSegmentation.from_pretrained(model_name) - - def run_inference(self, image, text_descriptions): - inputs = self.clipseg_processor( - text=text_descriptions, - images=[image] * len(text_descriptions), - padding=True, - return_tensors="pt", - ) - outputs = self.clipseg_model(**inputs) - logits = outputs.logits - return logits.detach().unsqueeze(1) diff --git a/dimos/models/segmentation/sam.py b/dimos/models/segmentation/sam.py deleted file mode 100644 index 96b23bf984..0000000000 --- a/dimos/models/segmentation/sam.py +++ /dev/null @@ -1,35 +0,0 @@ -# Copyright 2025 Dimensional Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import torch -from transformers import SamModel, SamProcessor - - -class SAM: - def __init__(self, model_name: str="facebook/sam-vit-huge", device: str="cuda") -> None: - self.device = device - self.sam_model = SamModel.from_pretrained(model_name).to(self.device) - self.sam_processor = SamProcessor.from_pretrained(model_name) - - def run_inference_from_points(self, image, points): - sam_inputs = self.sam_processor(image, input_points=points, return_tensors="pt").to( - self.device - ) - with torch.no_grad(): - sam_outputs = self.sam_model(**sam_inputs) - return self.sam_processor.image_processor.post_process_masks( - sam_outputs.pred_masks.cpu(), - sam_inputs["original_sizes"].cpu(), - sam_inputs["reshaped_input_sizes"].cpu(), - ) diff --git a/dimos/perception/detection/person_tracker.py b/dimos/perception/detection/person_tracker.py deleted file mode 100644 index 568214d972..0000000000 --- a/dimos/perception/detection/person_tracker.py +++ /dev/null @@ -1,115 +0,0 @@ -# Copyright 2025 Dimensional Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -from reactivex import operators as ops -from reactivex.observable import Observable - -from dimos.core import In, Module, Out, rpc -from dimos.msgs.geometry_msgs import PoseStamped, Transform, Vector3 -from dimos.msgs.sensor_msgs import CameraInfo, Image -from dimos.msgs.vision_msgs import Detection2DArray -from dimos.perception.detection.type import ImageDetections2D -from dimos.types.timestamped import align_timestamped -from dimos.utils.reactive import backpressure - - -class PersonTracker(Module): - detections: In[Detection2DArray] = None # type: ignore - image: In[Image] = None # type: ignore - target: Out[PoseStamped] = None # type: ignore - - camera_info: CameraInfo - - def __init__(self, cameraInfo: CameraInfo, **kwargs) -> None: - super().__init__(**kwargs) - self.camera_info = cameraInfo - - def center_to_3d( - self, - pixel: tuple[int, int], - camera_info: CameraInfo, - assumed_depth: float = 1.0, - ) -> Vector3: - """Unproject 2D pixel coordinates to 3D position in camera_link frame. - - Args: - camera_info: Camera calibration information - assumed_depth: Assumed depth in meters (default 1.0m from camera) - - Returns: - Vector3 position in camera_link frame coordinates (Z up, X forward) - """ - # Extract camera intrinsics - fx, fy = camera_info.K[0], camera_info.K[4] - cx, cy = camera_info.K[2], camera_info.K[5] - - # Unproject pixel to normalized camera coordinates - x_norm = (pixel[0] - cx) / fx - y_norm = (pixel[1] - cy) / fy - - # Create 3D point at assumed depth in camera optical frame - # Camera optical frame: X right, Y down, Z forward - x_optical = x_norm * assumed_depth - y_optical = y_norm * assumed_depth - z_optical = assumed_depth - - # Transform from camera optical frame to camera_link frame - # Optical: X right, Y down, Z forward - # Link: X forward, Y left, Z up - # Transformation: x_link = z_optical, y_link = -x_optical, z_link = -y_optical - return Vector3(z_optical, -x_optical, -y_optical) - - def detections_stream(self) -> Observable[ImageDetections2D]: - return backpressure( - align_timestamped( - self.image.pure_observable(), - self.detections.pure_observable().pipe( - ops.filter(lambda d: d.detections_length > 0) # type: ignore[attr-defined] - ), - match_tolerance=0.0, - buffer_size=2.0, - ).pipe(ops.map(lambda pair: ImageDetections2D.from_ros_detection2d_array(*pair))) - ) - - @rpc - def start(self) -> None: - self.detections_stream().subscribe(self.track) - - @rpc - def stop(self) -> None: - super().stop() - - def track(self, detections2D: ImageDetections2D) -> None: - if len(detections2D) == 0: - return - - target = max(detections2D.detections, key=lambda det: det.bbox_2d_volume()) - vector = self.center_to_3d(target.center_bbox, self.camera_info, 2.0) - - pose_in_camera = PoseStamped( - ts=detections2D.ts, - position=vector, - frame_id="camera_link", - ) - - tf_world_to_camera = self.tf.get("world", "camera_link", detections2D.ts, 5.0) - if not tf_world_to_camera: - return - - tf_camera_to_target = Transform.from_pose("target", pose_in_camera) - tf_world_to_target = tf_world_to_camera + tf_camera_to_target - pose_in_world = tf_world_to_target.to_pose(ts=detections2D.ts) - - self.target.publish(pose_in_world) diff --git a/dimos/robot/recorder.py b/dimos/robot/recorder.py deleted file mode 100644 index acc9c0140e..0000000000 --- a/dimos/robot/recorder.py +++ /dev/null @@ -1,166 +0,0 @@ -# Copyright 2025 Dimensional Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# UNDER DEVELOPMENT 🚧🚧🚧, NEEDS TESTING - -from collections.abc import Callable -from queue import Queue -import threading -import time -from types import TracebackType -from typing import Literal - -# from dimos.data.recording import Recorder - - -class RobotRecorder: - """A class for recording robot observation and actions. - - Recording at a specified frequency on the observation and action of a robot. It leverages a queue and a worker - thread to handle the recording asynchronously, ensuring that the main operations of the - robot are not blocked. - - Robot class must pass in the `get_state`, `get_observation`, `prepare_action` methods.` - get_state() gets the current state/pose of the robot. - get_observation() captures the observation/image of the robot. - prepare_action() calculates the action between the new and old states. - """ - - def __init__( - self, - get_state: Callable, - get_observation: Callable, - prepare_action: Callable, - frequency_hz: int = 5, - recorder_kwargs: dict | None = None, - on_static: Literal["record", "omit"] = "omit", - ) -> None: - """Initializes the RobotRecorder. - - This constructor sets up the recording mechanism on the given robot, including the recorder instance, - recording frequency, and the asynchronous processing queue and worker thread. It also - initializes attributes to track the last recorded pose and the current instruction. - - Args: - get_state: A function that returns the current state of the robot. - get_observation: A function that captures the observation/image of the robot. - prepare_action: A function that calculates the action between the new and old states. - frequency_hz: Frequency at which to record pose and image data (in Hz). - recorder_kwargs: Keyword arguments to pass to the Recorder constructor. - on_static: Whether to record on static poses or not. If "record", it will record when the robot is not moving. - """ - if recorder_kwargs is None: - recorder_kwargs = {} - self.recorder = Recorder(**recorder_kwargs) - self.task = None - - self.last_recorded_state = None - self.last_image = None - - self.recording = False - self.frequency_hz = frequency_hz - self.record_on_static = on_static == "record" - self.recording_queue = Queue() - - self.get_state = get_state - self.get_observation = get_observation - self.prepare_action = prepare_action - - self._worker_thread = threading.Thread(target=self._process_queue, daemon=True) - self._worker_thread.start() - - def __enter__(self) -> None: - """Enter the context manager, starting the recording.""" - self.start_recording(self.task) - - def __exit__( - self, - exc_type: type[BaseException] | None, - exc_value: BaseException | None, - traceback: TracebackType | None, - ) -> None: - """Exit the context manager, stopping the recording.""" - self.stop_recording() - - def record(self, task: str) -> "RobotRecorder": - """Set the task and return the context manager.""" - self.task = task - return self - - def reset_recorder(self) -> None: - """Reset the recorder.""" - while self.recording: - time.sleep(0.1) - self.recorder.reset() - - def record_from_robot(self) -> None: - """Records the current pose and captures an image at the specified frequency.""" - while self.recording: - start_time = time.perf_counter() - self.record_current_state() - elapsed_time = time.perf_counter() - start_time - # Sleep for the remaining time to maintain the desired frequency - sleep_time = max(0, (1.0 / self.frequency_hz) - elapsed_time) - time.sleep(sleep_time) - - def start_recording(self, task: str = "") -> None: - """Starts the recording of pose and image.""" - if not self.recording: - self.task = task - self.recording = True - self.recording_thread = threading.Thread(target=self.record_from_robot) - self.recording_thread.start() - - def stop_recording(self) -> None: - """Stops the recording of pose and image.""" - if self.recording: - self.recording = False - self.recording_thread.join() - - def _process_queue(self) -> None: - """Processes the recording queue asynchronously.""" - while True: - image, instruction, action, state = self.recording_queue.get() - self.recorder.record( - observation={"image": image, "instruction": instruction}, action=action, state=state - ) - self.recording_queue.task_done() - - def record_current_state(self) -> None: - """Records the current pose and image if the pose has changed.""" - state = self.get_state() - image = self.get_observation() - - # This is the beginning of the episode - if self.last_recorded_state is None: - self.last_recorded_state = state - self.last_image = image - return - - if state != self.last_recorded_state or self.record_on_static: - action = self.prepare_action(self.last_recorded_state, state) - self.recording_queue.put( - ( - self.last_image, - self.task, - action, - self.last_recorded_state, - ), - ) - self.last_image = image - self.last_recorded_state = state - - def record_last_state(self) -> None: - """Records the final pose and image after the movement completes.""" - self.record_current_state() diff --git a/dimos/robot/unitree_webrtc/modular/__init__.py b/dimos/robot/unitree_webrtc/modular/__init__.py index d823cd796e..5c2169cc9b 100644 --- a/dimos/robot/unitree_webrtc/modular/__init__.py +++ b/dimos/robot/unitree_webrtc/modular/__init__.py @@ -1,2 +1 @@ from dimos.robot.unitree_webrtc.modular.connection_module import deploy_connection -from dimos.robot.unitree_webrtc.modular.navigation import deploy_navigation diff --git a/dimos/robot/unitree_webrtc/modular/navigation.py b/dimos/robot/unitree_webrtc/modular/navigation.py deleted file mode 100644 index 9aa03d104e..0000000000 --- a/dimos/robot/unitree_webrtc/modular/navigation.py +++ /dev/null @@ -1,93 +0,0 @@ -# Copyright 2025 Dimensional Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from dimos_lcm.std_msgs import Bool, String - -from dimos.core import LCMTransport -from dimos.msgs.geometry_msgs import PoseStamped, Twist -from dimos.msgs.nav_msgs import OccupancyGrid, Path -from dimos.navigation.bt_navigator.navigator import BehaviorTreeNavigator -from dimos.navigation.frontier_exploration import WavefrontFrontierExplorer -from dimos.navigation.global_planner import AstarPlanner -from dimos.navigation.local_planner.holonomic_local_planner import HolonomicLocalPlanner -from dimos.robot.unitree_webrtc.type.lidar import LidarMessage -from dimos.robot.unitree_webrtc.type.map import Map -from dimos.web.websocket_vis.websocket_vis_module import WebsocketVisModule - - -def deploy_navigation(dimos, connection): - mapper = dimos.deploy(Map, voxel_size=0.5, cost_resolution=0.05, global_publish_interval=2.5) - mapper.lidar.connect(connection.lidar) - mapper.global_map.transport = LCMTransport("/global_map", LidarMessage) - mapper.global_costmap.transport = LCMTransport("/global_costmap", OccupancyGrid) - mapper.local_costmap.transport = LCMTransport("/local_costmap", OccupancyGrid) - - """Deploy and configure navigation modules.""" - global_planner = dimos.deploy(AstarPlanner) - local_planner = dimos.deploy(HolonomicLocalPlanner) - navigator = dimos.deploy( - BehaviorTreeNavigator, - reset_local_planner=local_planner.reset, - check_goal_reached=local_planner.is_goal_reached, - ) - frontier_explorer = dimos.deploy(WavefrontFrontierExplorer) - - navigator.goal.transport = LCMTransport("/navigation_goal", PoseStamped) - navigator.goal_request.transport = LCMTransport("/goal_request", PoseStamped) - navigator.goal_reached.transport = LCMTransport("/goal_reached", Bool) - navigator.navigation_state.transport = LCMTransport("/navigation_state", String) - navigator.global_costmap.transport = LCMTransport("/global_costmap", OccupancyGrid) - global_planner.path.transport = LCMTransport("/global_path", Path) - local_planner.cmd_vel.transport = LCMTransport("/cmd_vel", Twist) - frontier_explorer.goal_request.transport = LCMTransport("/goal_request", PoseStamped) - frontier_explorer.goal_reached.transport = LCMTransport("/goal_reached", Bool) - frontier_explorer.explore_cmd.transport = LCMTransport("/explore_cmd", Bool) - frontier_explorer.stop_explore_cmd.transport = LCMTransport("/stop_explore_cmd", Bool) - - global_planner.target.connect(navigator.goal) - - global_planner.global_costmap.connect(mapper.global_costmap) - global_planner.odom.connect(connection.odom) - - local_planner.path.connect(global_planner.path) - local_planner.local_costmap.connect(mapper.local_costmap) - local_planner.odom.connect(connection.odom) - - connection.movecmd.connect(local_planner.cmd_vel) - - navigator.odom.connect(connection.odom) - - frontier_explorer.costmap.connect(mapper.global_costmap) - frontier_explorer.odometry.connect(connection.odom) - websocket_vis = dimos.deploy(WebsocketVisModule, port=7779) - websocket_vis.click_goal.transport = LCMTransport("/goal_request", PoseStamped) - - websocket_vis.robot_pose.connect(connection.odom) - websocket_vis.path.connect(global_planner.path) - websocket_vis.global_costmap.connect(mapper.global_costmap) - - mapper.start() - global_planner.start() - local_planner.start() - navigator.start() - websocket_vis.start() - - return { - "mapper": mapper, - "global_planner": global_planner, - "local_planner": local_planner, - "navigator": navigator, - "frontier_explorer": frontier_explorer, - "websocket_vis": websocket_vis, - } diff --git a/dimos/robot/unitree_webrtc/testing/multimock.py b/dimos/robot/unitree_webrtc/testing/multimock.py deleted file mode 100644 index eab10e14bb..0000000000 --- a/dimos/robot/unitree_webrtc/testing/multimock.py +++ /dev/null @@ -1,148 +0,0 @@ -#!/usr/bin/env python3 -# Copyright 2025 Dimensional Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Multimock – lightweight persistence & replay helper built on RxPy. - -A directory of pickle files acts as a tiny append-only log of (timestamp, data) -pairs. You can: - • save() / consume(): append new frames - • iterate(): read them back lazily - • interval_stream(): emit at a fixed cadence - • stream(): replay with original timing (optionally scaled) - -The implementation keeps memory usage constant by relying on reactive -operators instead of pre-materialising lists. Timing is reproduced via -`rx.timer`, and drift is avoided with `concat_map`. -""" - -from __future__ import annotations - -import glob -import os -import pickle -import time -from typing import TYPE_CHECKING, Any, Generic, TypeVar - -from reactivex import from_iterable, interval, operators as ops - -from dimos.robot.unitree_webrtc.type.timeseries import TEvent, Timeseries -from dimos.utils.threadpool import get_scheduler - -if TYPE_CHECKING: - import builtins - from collections.abc import Iterator - - from reactivex.observable import Observable - from reactivex.scheduler import ThreadPoolScheduler - -T = TypeVar("T") - - -class Multimock(Generic[T], Timeseries[TEvent[T]]): - """Persist frames as pickle files and replay them with RxPy.""" - - def __init__(self, root: str = "office", file_prefix: str = "msg") -> None: - current_dir = os.path.dirname(os.path.abspath(__file__)) - self.root = os.path.join(current_dir, f"multimockdata/{root}") - self.file_prefix = file_prefix - - os.makedirs(self.root, exist_ok=True) - self.cnt: int = 0 - - def save(self, *frames: Any) -> int: - """Persist one or more frames; returns the new counter value.""" - for frame in frames: - self.save_one(frame) - return self.cnt - - def save_one(self, frame: Any) -> int: - """Persist a single frame and return the running count.""" - file_name = f"/{self.file_prefix}_{self.cnt:03d}.pickle" - full_path = os.path.join(self.root, file_name.lstrip("/")) - self.cnt += 1 - - if os.path.isfile(full_path): - raise FileExistsError(f"file {full_path} exists") - - # Optional convinience magic to extract raw messages from advanced types - # trying to deprecate for now - # if hasattr(frame, "raw_msg"): - # frame = frame.raw_msg # type: ignore[attr-defined] - - with open(full_path, "wb") as f: - pickle.dump([time.time(), frame], f) - - return self.cnt - - def load(self, *names: int | str) -> builtins.list[tuple[float, T]]: - """Load multiple items by name or index.""" - return list(map(self.load_one, names)) - - def load_one(self, name: int | str) -> TEvent[T]: - """Load a single item by name or index.""" - if isinstance(name, int): - file_name = f"/{self.file_prefix}_{name:03d}.pickle" - else: - file_name = f"/{name}.pickle" - - full_path = os.path.join(self.root, file_name.lstrip("/")) - - with open(full_path, "rb") as f: - timestamp, data = pickle.load(f) - - return TEvent(timestamp, data) - - def iterate(self) -> Iterator[TEvent[T]]: - """Yield all persisted TEvent(timestamp, data) pairs lazily in order.""" - pattern = os.path.join(self.root, f"{self.file_prefix}_*.pickle") - for file_path in sorted(glob.glob(pattern)): - with open(file_path, "rb") as f: - timestamp, data = pickle.load(f) - yield TEvent(timestamp, data) - - def list(self) -> builtins.list[TEvent[T]]: - return list(self.iterate()) - - def interval_stream(self, rate_hz: float = 10.0) -> Observable[T]: - """Emit frames at a fixed rate, ignoring recorded timing.""" - sleep_time = 1.0 / rate_hz - return from_iterable(self.iterate()).pipe( - ops.zip(interval(sleep_time)), - ops.map(lambda pair: pair[1]), # keep only the frame - ) - - def stream( - self, - replay_speed: float = 1.0, - scheduler: ThreadPoolScheduler | None = None, - ) -> Observable[T]: - def _generator(): - prev_ts: float | None = None - for event in self.iterate(): - if prev_ts is not None: - delay = (event.ts - prev_ts).total_seconds() / replay_speed - time.sleep(delay) - prev_ts = event.ts - yield event.data - - return from_iterable(_generator(), scheduler=scheduler or get_scheduler()) - - def consume(self, observable: Observable[Any]) -> Observable[int]: - """Side-effect: save every frame that passes through.""" - return observable.pipe(ops.map(self.save_one)) - - def __iter__(self) -> Iterator[TEvent[T]]: - """Allow iteration over the Multimock instance to yield TEvent(timestamp, data) pairs.""" - return self.iterate() diff --git a/dimos/stream/video_providers/unitree.py b/dimos/stream/video_providers/unitree.py deleted file mode 100644 index ba28cb1d6f..0000000000 --- a/dimos/stream/video_providers/unitree.py +++ /dev/null @@ -1,168 +0,0 @@ -# Copyright 2025 Dimensional Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import asyncio -import logging -from queue import Queue -import threading -import time - -from aiortc import MediaStreamTrack -from go2_webrtc_driver.webrtc_driver import Go2WebRTCConnection, WebRTCConnectionMethod -from reactivex import Observable, create, operators as ops - -from dimos.stream.video_provider import AbstractVideoProvider - - -class UnitreeVideoProvider(AbstractVideoProvider): - def __init__( - self, - dev_name: str = "UnitreeGo2", - connection_method: WebRTCConnectionMethod = WebRTCConnectionMethod.LocalSTA, - serial_number: str | None = None, - ip: str | None = None, - ) -> None: - """Initialize the Unitree video stream with WebRTC connection. - - Args: - dev_name: Name of the device - connection_method: WebRTC connection method (LocalSTA, LocalAP, Remote) - serial_number: Serial number of the robot (required for LocalSTA with serial) - ip: IP address of the robot (required for LocalSTA with IP) - """ - super().__init__(dev_name) - self.frame_queue = Queue() - self.loop = None - self.asyncio_thread = None - - # Initialize WebRTC connection based on method - if connection_method == WebRTCConnectionMethod.LocalSTA: - if serial_number: - self.conn = Go2WebRTCConnection(connection_method, serialNumber=serial_number) - elif ip: - self.conn = Go2WebRTCConnection(connection_method, ip=ip) - else: - raise ValueError( - "Either serial_number or ip must be provided for LocalSTA connection" - ) - elif connection_method == WebRTCConnectionMethod.LocalAP: - self.conn = Go2WebRTCConnection(connection_method) - else: - raise ValueError("Unsupported connection method") - - async def _recv_camera_stream(self, track: MediaStreamTrack) -> None: - """Receive video frames from WebRTC and put them in the queue.""" - while True: - frame = await track.recv() - # Convert the frame to a NumPy array in BGR format - img = frame.to_ndarray(format="bgr24") - self.frame_queue.put(img) - - def _run_asyncio_loop(self, loop) -> None: - """Run the asyncio event loop in a separate thread.""" - asyncio.set_event_loop(loop) - - async def setup(): - try: - await self.conn.connect() - self.conn.video.switchVideoChannel(True) - self.conn.video.add_track_callback(self._recv_camera_stream) - - await self.conn.datachannel.switchToNormalMode() - # await self.conn.datachannel.sendDamp() - - # await asyncio.sleep(5) - - # await self.conn.datachannel.sendDamp() - # await asyncio.sleep(5) - # await self.conn.datachannel.sendStandUp() - # await asyncio.sleep(5) - - # Wiggle the robot - # await self.conn.datachannel.switchToNormalMode() - # await self.conn.datachannel.sendWiggle() - # await asyncio.sleep(3) - - # Stretch the robot - # await self.conn.datachannel.sendStretch() - # await asyncio.sleep(3) - - except Exception as e: - logging.error(f"Error in WebRTC connection: {e}") - raise - - loop.run_until_complete(setup()) - loop.run_forever() - - def capture_video_as_observable(self, fps: int = 30) -> Observable: - """Create an observable that emits video frames at the specified FPS. - - Args: - fps: Frames per second to emit (default: 30) - - Returns: - Observable emitting video frames - """ - frame_interval = 1.0 / fps - - def emit_frames(observer, scheduler) -> None: - try: - # Start asyncio loop if not already running - if not self.loop: - self.loop = asyncio.new_event_loop() - self.asyncio_thread = threading.Thread( - target=self._run_asyncio_loop, args=(self.loop,) - ) - self.asyncio_thread.start() - - frame_time = time.monotonic() - - while True: - if not self.frame_queue.empty(): - frame = self.frame_queue.get() - - # Control frame rate - now = time.monotonic() - next_frame_time = frame_time + frame_interval - sleep_time = next_frame_time - now - - if sleep_time > 0: - time.sleep(sleep_time) - - observer.on_next(frame) - frame_time = next_frame_time - else: - time.sleep(0.001) # Small sleep to prevent CPU overuse - - except Exception as e: - logging.error(f"Error during frame emission: {e}") - observer.on_error(e) - finally: - if self.loop: - self.loop.call_soon_threadsafe(self.loop.stop) - if self.asyncio_thread: - self.asyncio_thread.join() - observer.on_completed() - - return create(emit_frames).pipe( - ops.share() # Share the stream among multiple subscribers - ) - - def dispose_all(self) -> None: - """Clean up resources.""" - if self.loop: - self.loop.call_soon_threadsafe(self.loop.stop) - if self.asyncio_thread: - self.asyncio_thread.join() - super().dispose_all() diff --git a/dimos/stream/videostream.py b/dimos/stream/videostream.py deleted file mode 100644 index 9c99ddea3a..0000000000 --- a/dimos/stream/videostream.py +++ /dev/null @@ -1,43 +0,0 @@ -# Copyright 2025 Dimensional Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from collections.abc import Iterator - -import cv2 - - -class VideoStream: - def __init__(self, source: int = 0) -> None: - """ - Initialize the video stream from a camera source. - - Args: - source (int or str): Camera index or video file path. - """ - self.capture = cv2.VideoCapture(source) - if not self.capture.isOpened(): - raise ValueError(f"Unable to open video source {source}") - - def __iter__(self) -> Iterator: - return self - - def __next__(self): - ret, frame = self.capture.read() - if not ret: - self.capture.release() - raise StopIteration - return frame - - def release(self) -> None: - self.capture.release() diff --git a/dimos/types/label.py b/dimos/types/label.py deleted file mode 100644 index 83b91c8152..0000000000 --- a/dimos/types/label.py +++ /dev/null @@ -1,39 +0,0 @@ -# Copyright 2025 Dimensional Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from typing import Any - - -class LabelType: - def __init__(self, labels: dict[str, Any], metadata: Any = None) -> None: - """ - Initializes a standardized label type. - - Args: - labels (Dict[str, Any]): A dictionary of labels with descriptions. - metadata (Any, optional): Additional metadata related to the labels. - """ - self.labels = labels - self.metadata = metadata - - def get_label_descriptions(self): - """Return a list of label descriptions.""" - return [desc["description"] for desc in self.labels.values()] - - def save_to_json(self, filepath: str) -> None: - """Save the labels to a JSON file.""" - import json - - with open(filepath, "w") as f: - json.dump(self.labels, f, indent=4) diff --git a/dimos/types/segmentation.py b/dimos/types/segmentation.py deleted file mode 100644 index 1f3c2a0773..0000000000 --- a/dimos/types/segmentation.py +++ /dev/null @@ -1,45 +0,0 @@ -# Copyright 2025 Dimensional Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from typing import Any - -import numpy as np - - -class SegmentationType: - def __init__(self, masks: list[np.ndarray], metadata: Any = None) -> None: - """ - Initializes a standardized segmentation type. - - Args: - masks (List[np.ndarray]): A list of binary masks for segmentation. - metadata (Any, optional): Additional metadata related to the segmentations. - """ - self.masks = masks - self.metadata = metadata - - def combine_masks(self): - """Combine all masks into a single mask.""" - combined_mask = np.zeros_like(self.masks[0]) - for mask in self.masks: - combined_mask = np.logical_or(combined_mask, mask) - return combined_mask - - def save_masks(self, directory: str) -> None: - """Save each mask to a separate file.""" - import os - - os.makedirs(directory, exist_ok=True) - for i, mask in enumerate(self.masks): - np.save(os.path.join(directory, f"mask_{i}.npy"), mask) diff --git a/dimos/utils/deprecation.py b/dimos/utils/deprecation.py deleted file mode 100644 index 3c4dd5929e..0000000000 --- a/dimos/utils/deprecation.py +++ /dev/null @@ -1,36 +0,0 @@ -# Copyright 2025 Dimensional Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import functools -import warnings - - -def deprecated(reason: str): - """ - This function itself is deprecated as we can use `from warnings import deprecated` in Python 3.13+. - """ - - def decorator(func): - @functools.wraps(func) - def wrapper(*args, **kwargs): - warnings.warn( - f"{func.__name__} is deprecated: {reason}", - category=DeprecationWarning, - stacklevel=2, - ) - return func(*args, **kwargs) - - return wrapper - - return decorator diff --git a/dimos/utils/generic_subscriber.py b/dimos/utils/generic_subscriber.py deleted file mode 100644 index 5f687c494a..0000000000 --- a/dimos/utils/generic_subscriber.py +++ /dev/null @@ -1,108 +0,0 @@ -#!/usr/bin/env python3 - -# Copyright 2025 Dimensional Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import logging -import threading -from typing import TYPE_CHECKING, Any - -from reactivex import Observable - -if TYPE_CHECKING: - from reactivex.disposable import Disposable - -logger = logging.getLogger(__name__) - - -class GenericSubscriber: - """Subscribes to an RxPy Observable stream and stores the latest message.""" - - def __init__(self, stream: Observable) -> None: - """Initialize the subscriber and subscribe to the stream. - - Args: - stream: The RxPy Observable stream to subscribe to. - """ - self.latest_message: Any | None = None - self._lock = threading.Lock() - self._subscription: Disposable | None = None - self._stream_completed = threading.Event() - self._stream_error: Exception | None = None - - if stream is not None: - try: - self._subscription = stream.subscribe( - on_next=self._on_next, on_error=self._on_error, on_completed=self._on_completed - ) - logger.debug(f"Subscribed to stream {stream}") - except Exception as e: - logger.error(f"Error subscribing to stream {stream}: {e}") - self._stream_error = e # Store error if subscription fails immediately - else: - logger.warning("Initialized GenericSubscriber with a None stream.") - - def _on_next(self, message: Any) -> None: - """Callback for receiving a new message.""" - with self._lock: - self.latest_message = message - # logger.debug("Received new message") # Can be noisy - - def _on_error(self, error: Exception) -> None: - """Callback for stream error.""" - logger.error(f"Stream error: {error}") - with self._lock: - self._stream_error = error - self._stream_completed.set() # Signal completion/error - - def _on_completed(self) -> None: - """Callback for stream completion.""" - logger.info("Stream completed.") - self._stream_completed.set() - - def get_data(self) -> Any | None: - """Get the latest message received from the stream. - - Returns: - The latest message, or None if no message has been received yet. - """ - with self._lock: - # Optionally check for errors if needed by the caller - # if self._stream_error: - # logger.warning("Attempting to get message after stream error.") - return self.latest_message - - def has_error(self) -> bool: - """Check if the stream encountered an error.""" - with self._lock: - return self._stream_error is not None - - def is_completed(self) -> bool: - """Check if the stream has completed or encountered an error.""" - return self._stream_completed.is_set() - - def dispose(self) -> None: - """Dispose of the subscription to stop receiving messages.""" - if self._subscription is not None: - try: - self._subscription.dispose() - logger.debug("Subscription disposed.") - self._subscription = None - except Exception as e: - logger.error(f"Error disposing subscription: {e}") - self._stream_completed.set() # Ensure completed flag is set on manual dispose - - def __del__(self) -> None: - """Ensure cleanup on object deletion.""" - self.dispose() diff --git a/dimos/utils/s3_utils.py b/dimos/utils/s3_utils.py deleted file mode 100644 index f4c3227a71..0000000000 --- a/dimos/utils/s3_utils.py +++ /dev/null @@ -1,95 +0,0 @@ -# Copyright 2025 Dimensional Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os - -import boto3 - -try: - import open3d as o3d -except Exception as e: - print(f"Open3D not importing, assuming to be running outside of docker. {e}") - - -class S3Utils: - def __init__(self, bucket_name: str) -> None: - self.s3 = boto3.client("s3") - self.bucket_name = bucket_name - - def download_file(self, s3_key, local_path) -> None: - try: - self.s3.download_file(self.bucket_name, s3_key, local_path) - print(f"Downloaded {s3_key} to {local_path}") - except Exception as e: - print(f"Error downloading {s3_key}: {e}") - - def upload_file(self, local_path, s3_key) -> None: - try: - self.s3.upload_file(local_path, self.bucket_name, s3_key) - print(f"Uploaded {local_path} to {s3_key}") - except Exception as e: - print(f"Error uploading {local_path}: {e}") - - def save_pointcloud_to_s3(self, inlier_cloud, s3_key) -> None: - try: - temp_pcd_file = "/tmp/temp_pointcloud.pcd" - o3d.io.write_point_cloud(temp_pcd_file, inlier_cloud) - with open(temp_pcd_file, "rb") as pcd_file: - self.s3.put_object(Bucket=self.bucket_name, Key=s3_key, Body=pcd_file.read()) - os.remove(temp_pcd_file) - print(f"Saved pointcloud to {s3_key}") - except Exception as e: - print(f"error downloading {s3_key}: {e}") - - def restore_pointcloud_from_s3(self, pointcloud_paths): - restored_pointclouds = [] - - for path in pointcloud_paths: - # Download the point cloud file from S3 to memory - pcd_obj = self.s3.get_object(Bucket=self.bucket_name, Key=path) - pcd_data = pcd_obj["Body"].read() - - # Save the point cloud data to a temporary file - temp_pcd_file = "/tmp/temp_pointcloud.pcd" - with open(temp_pcd_file, "wb") as f: - f.write(pcd_data) - - # Read the point cloud from the temporary file - pcd = o3d.io.read_point_cloud(temp_pcd_file) - restored_pointclouds.append(pcd) - - # Remove the temporary file - os.remove(temp_pcd_file) - - return restored_pointclouds - - @staticmethod - def upload_text_file(bucket_name: str, local_path, s3_key) -> None: - s3 = boto3.client("s3") - try: - with open(local_path) as file: - content = file.read() - - # Ensure the s3_key includes the file name - if not s3_key.endswith("/"): - s3_key = s3_key + "/" - - # Extract the file name from the local_path - file_name = local_path.split("/")[-1] - full_s3_key = s3_key + file_name - - s3.put_object(Bucket=bucket_name, Key=full_s3_key, Body=content) - print(f"Uploaded text file {local_path} to {full_s3_key}") - except Exception as e: - print(f"Error uploading text file {local_path}: {e}") diff --git a/docker/deprecated/jetson/README.md b/docker/deprecated/jetson/README.md deleted file mode 100644 index 23ec6c250f..0000000000 --- a/docker/deprecated/jetson/README.md +++ /dev/null @@ -1,98 +0,0 @@ -# Jetson Setup Guide - -This guide explains how to set up and run local dimOS LLM Agents on NVIDIA Jetson devices. - -## Prerequisites - -> **Note**: This setup has been tested on: -> - Jetson Orin Nano (8GB) -> - JetPack 6.2 (L4T 36.4.3) -> - CUDA 12.6.68 - -### Requirements -- NVIDIA Jetson device (Orin/Xavier) -- Docker installed (with GPU support) -- Git installed -- CUDA installed - -## Basic Python Setup (Virtual Environment) - -### 1. Create a virtual environment: -```bash -python3 -m venv ~/jetson_env -source ~/jetson_env/bin/activate -``` - -### 2. Install cuSPARSELt: - -For PyTorch versions 24.06+ (see [Compatibility Matrix](https://docs.nvidia.com/deeplearning/frameworks/install-pytorch-jetson-platform-release-notes/pytorch-jetson-rel.html#pytorch-jetson-rel)), cuSPARSELt is required. Install it with the [instructions](https://developer.nvidia.com/cusparselt-downloads) by selecting Linux OS, aarch64-jetson architecture, and Ubuntu distribution - -For Jetpack 6.2, Pytorch 2.5, and CUDA 12.6: -```bash -wget https://developer.download.nvidia.com/compute/cusparselt/0.7.0/local_installers/cusparselt-local-tegra-repo-ubuntu2204-0.7.0_1.0-1_arm64.deb -sudo dpkg -i cusparselt-local-tegra-repo-ubuntu2204-0.7.0_1.0-1_arm64.deb -sudo cp /var/cusparselt-local-tegra-repo-ubuntu2204-0.7.0/cusparselt-*-keyring.gpg /usr/share/keyrings/ -sudo apt-get update -sudo apt-get -y install libcusparselt0 libcusparselt-dev -``` - -### 3. Install the Jetson-specific requirements: -```bash -cd /path/to/dimos -pip install -r docker/jetson/jetson_requirements.txt -``` - -### 4. Run testfile: -```bash -export PYTHONPATH=$PYTHONPATH:$(pwd) -python3 tests/test_agent_huggingface_local_jetson.py -``` - -## Docker Setup -for JetPack 6.2 (L4T 36.4.3), CUDA 12.6.68 - -### 1. Build and Run using Docker Compose - -From the DIMOS project root directory: -```bash -# Build and run the container -sudo docker compose -f docker/jetson/huggingface_local/docker-compose.yml up --build -``` - -This will: -- Build the Docker image with all necessary dependencies -- Start the container with GPU support -- Run the HuggingFace local agent test script - -## Troubleshooting - -### Libopenblas or other library errors - -Run the Jetson fix script: - -```bash -# From the DIMOS project root -chmod +x ./docker/jetson/fix_jetson.sh -./docker/jetson/fix_jetson.sh -``` - -This script will: -- Install cuSPARSELt library for tensor operations -- Fix libopenblas.so.0 dependencies -- Configure system libraries - -1. If you encounter CUDA/GPU issues: - - Ensure JetPack is properly installed - - Check nvidia-smi output - - Verify Docker has access to the GPU - -2. For memory issues: - - Consider using smaller / quantized models - - Adjust batch sizes and model parameters - - Run the jetson in non-GUI mode to maximize ram availability - -## Notes - -- The setup uses PyTorch built specifically for Jetson -- Models are downloaded and cached locally -- GPU acceleration is enabled by default diff --git a/docker/deprecated/jetson/fix_jetson.sh b/docker/deprecated/jetson/fix_jetson.sh deleted file mode 100644 index ade938a2c9..0000000000 --- a/docker/deprecated/jetson/fix_jetson.sh +++ /dev/null @@ -1,21 +0,0 @@ -#!/bin/bash - -# Install cuSPARSELt -# wget https://developer.download.nvidia.com/compute/cusparselt/0.7.0/local_installers/cusparselt-local-tegra-repo-ubuntu2204-0.7.0_1.0-1_arm64.deb -# sudo dpkg -i cusparselt-local-tegra-repo-ubuntu2204-0.7.0_1.0-1_arm64.deb -# sudo cp /var/cusparselt-local-tegra-repo-ubuntu2204-0.7.0/cusparselt-*-keyring.gpg /usr/share/keyrings/ -# sudo apt-get update -# sudo apt-get install libcusparselt0 libcusparselt-dev - -# Fixes libopenblas.so.0 import error -sudo rm -r /lib/aarch64-linux-gnu/libopenblas.so.0 -sudo apt-get update -sudo apt-get remove --purge libopenblas-dev libopenblas0 libopenblas0-dev -sudo apt-get install libopenblas-dev -sudo apt-get update -sudo apt-get remove --purge libopenblas0-openmp -sudo apt-get install libopenblas0-openmp - -# Verify libopenblas.so.0 location and access -ls -l /lib/aarch64-linux-gnu/libopenblas.so.0 - diff --git a/docker/deprecated/jetson/huggingface_local/Dockerfile b/docker/deprecated/jetson/huggingface_local/Dockerfile deleted file mode 100644 index dcb1738b90..0000000000 --- a/docker/deprecated/jetson/huggingface_local/Dockerfile +++ /dev/null @@ -1,44 +0,0 @@ -FROM python:3.10.12 - -# Unitree Specific -RUN apt-get update && apt-get install -y \ - libgl1-mesa-glx \ - build-essential \ - libavformat-dev \ - libavcodec-dev \ - libavdevice-dev \ - libavutil-dev \ - libswscale-dev \ - libpostproc-dev \ - gcc \ - make \ - portaudio19-dev \ - python3-pyaudio \ - python3-all-dev \ - libopenblas0-openmp - -# Jetson Orin Nano specific setup -RUN wget https://developer.download.nvidia.com/compute/cusparselt/0.7.0/local_installers/cusparselt-local-tegra-repo-ubuntu2204-0.7.0_1.0-1_arm64.deb && \ - dpkg -i cusparselt-local-tegra-repo-ubuntu2204-0.7.0_1.0-1_arm64.deb && \ - cp /var/cusparselt-local-tegra-repo-ubuntu2204-0.7.0/cusparselt-*-keyring.gpg /usr/share/keyrings/ && \ - apt-get update && \ - apt-get install -y libcusparselt0 libcusparselt-dev - - -# Change working directory to /app for proper relative pathing -WORKDIR /app - -COPY docker/jetson/jetson_requirements.txt ./requirements.txt - -COPY ./dimos/perception/external ./dimos/perception/external - -RUN pip install --no-cache-dir -r requirements.txt - -COPY ./dimos ./dimos - -COPY ./tests ./tests - -COPY ./dimos/__init__.py ./ - -# Copy libopenblas.so.0 from host if it exists (Jetson path) -RUN ldconfig diff --git a/docker/deprecated/jetson/huggingface_local/docker-compose.yml b/docker/deprecated/jetson/huggingface_local/docker-compose.yml deleted file mode 100644 index 4d87ce30f7..0000000000 --- a/docker/deprecated/jetson/huggingface_local/docker-compose.yml +++ /dev/null @@ -1,36 +0,0 @@ ---- -services: - dimos-model-huggingface-local: - image: dimos-jetson-huggingface-local:latest - build: - context: ../../../ - dockerfile: docker/jetson/huggingface_local/Dockerfile - env_file: - - ../../../.env - mem_limit: 8048m - volumes: - - ../../../assets:/app/assets - - ../../../assets/model-cache:/root/.cache/huggingface/hub - - /usr/local/cuda:/usr/local/cuda - - /usr/lib/aarch64-linux-gnu:/usr/lib/aarch64-linux-gnu - - ports: - - "5555:5555" - runtime: nvidia - environment: - - PYTHONUNBUFFERED=1 - - NVIDIA_VISIBLE_DEVICES=all - - NVIDIA_DRIVER_CAPABILITIES=all - # command: [ "python", "-m", "tests.test_agent_alibaba" ] - command: [ "python", "-m", "tests.test_agent_huggingface_local_jetson.py" ] - stdin_open: true - tty: true - -# IMPORTANT: This runs soley on the NVIDA GPU - -# ---- -# TO RUN: -# docker build -f ./Dockerfile -t dimos-models ../../ && docker compose up -# GO TO: -# 127.0.0.1:5555 (when flask server fixed) -# ---- diff --git a/docker/deprecated/jetson/jetson_requirements.txt b/docker/deprecated/jetson/jetson_requirements.txt deleted file mode 100644 index 6d42f2dc4c..0000000000 --- a/docker/deprecated/jetson/jetson_requirements.txt +++ /dev/null @@ -1,79 +0,0 @@ -opencv-python -python-dotenv -openai -anthropic>=0.19.0 -numpy -colorlog==6.9.0 -yapf==0.40.2 -typeguard -empy==3.3.4 -catkin_pkg -lark - -# pycolmap - -ffmpeg-python -pytest -python-dotenv -openai -tiktoken>=0.8.0 -Flask>=2.2 -python-multipart==0.0.20 -reactivex - -# Web Extensions -fastapi>=0.115.6 -sse-starlette>=2.2.1 -uvicorn>=0.34.0 - -# Agent Memory -langchain-chroma>=0.1.4 -langchain-openai>=0.2.14 - -# Class Extraction -pydantic - -# Developer Specific -ipykernel - -# Unitree webrtc streaming -aiortc==1.9.0 -pycryptodome -opencv-python -sounddevice -pyaudio -requests -wasmtime - -# Audio -openai-whisper -soundfile - -#Hugging Face -transformers[torch]==4.49.0 - -#Vector Embedding -sentence_transformers - -# CTransforms GGUF - GPU required -ctransformers[cuda]==0.2.27 - -# Perception Dependencies -ultralytics>=8.3.70 -filterpy>=1.4.5 -scipy>=1.15.1 - -# Pytorch wheel for JP6, cu12.6 -https://pypi.jetson-ai-lab.dev/jp6/cu126/+f/6cc/6ecfe8a5994fd/torch-2.6.0-cp310-cp310-linux_aarch64.whl - -# Torchvision wheel for JP6, cu12.6 -https://pypi.jetson-ai-lab.dev/jp6/cu126/+f/aa2/2da8dcf4c4c8d/torchvision-0.21.0-cp310-cp310-linux_aarch64.whl - -scikit-learn -Pillow -mmengine>=0.10.3 -mmcv==2.1.0 -timm==1.0.15 -lap==0.5.12 -# xformers==0.0.22 -# -e ./dimos/perception/external/vector_perception diff --git a/docker/deprecated/models/ctransformers_gguf/Dockerfile b/docker/deprecated/models/ctransformers_gguf/Dockerfile deleted file mode 100644 index a0e8a1edb0..0000000000 --- a/docker/deprecated/models/ctransformers_gguf/Dockerfile +++ /dev/null @@ -1,46 +0,0 @@ -FROM nvidia/cuda:12.1.0-cudnn8-runtime-ubuntu22.04 - -# Set up Python environment -ENV DEBIAN_FRONTEND=noninteractive -RUN apt-get update && apt-get install -y \ - python3.10 \ - python3-pip \ - python3.10-venv \ - python3-dev \ - libgl1-mesa-glx \ - build-essential \ - libavformat-dev \ - libavcodec-dev \ - libavdevice-dev \ - libavutil-dev \ - libswscale-dev \ - libpostproc-dev \ - gcc \ - make \ - portaudio19-dev \ - python3-pyaudio \ - python3-all-dev \ - git \ - wget \ - && rm -rf /var/lib/apt/lists/* - -# Create symlink for python -RUN ln -sf /usr/bin/python3.10 /usr/bin/python - -# Change working directory to /app for proper relative pathing -WORKDIR /app - -COPY requirements.txt ./ - -RUN pip install --no-cache-dir -r requirements.txt - -COPY ./dimos ./dimos - -COPY ./tests ./tests - -COPY ./dimos/__init__.py ./ - -# Add CUDA libraries to the path -ENV LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH - -CMD [ "python", "-m", "tests.test_agent_ctransformers_gguf" ] diff --git a/docker/deprecated/models/ctransformers_gguf/docker-compose.yml b/docker/deprecated/models/ctransformers_gguf/docker-compose.yml deleted file mode 100644 index 9cedfa4aa0..0000000000 --- a/docker/deprecated/models/ctransformers_gguf/docker-compose.yml +++ /dev/null @@ -1,32 +0,0 @@ ---- -services: - dimos-model-ctransformers-gguf: - image: dimos-model-ctransformers-gguf:latest - build: - context: ../../../ - dockerfile: docker/models/ctransformers_gguf/Dockerfile - env_file: - - ../../../.env - mem_limit: 8048m - volumes: - - ../../../assets:/app/assets - - ../../../assets/model-cache:/root/.cache/huggingface/hub - ports: - - "5555:5555" - runtime: nvidia - environment: - - PYTHONUNBUFFERED=1 - - NVIDIA_VISIBLE_DEVICES=all - - NVIDIA_DRIVER_CAPABILITIES=all - command: [ "python", "-m", "tests.test_agent_ctransformers_gguf" ] - stdin_open: true - tty: true - -# IMPORTANT: This runs soley on the NVIDA GPU - -# ---- -# TO RUN: -# docker build -f ./Dockerfile -t dimos-models ../../ && docker compose up -# GO TO: -# 127.0.0.1:5555 (when flask server fixed) -# ---- diff --git a/docker/deprecated/models/huggingface_local/Dockerfile b/docker/deprecated/models/huggingface_local/Dockerfile deleted file mode 100644 index 2c5435ae5f..0000000000 --- a/docker/deprecated/models/huggingface_local/Dockerfile +++ /dev/null @@ -1,32 +0,0 @@ -FROM python:3.10.12 - -# Unitree Specific -RUN apt-get update && apt-get install -y \ - libgl1-mesa-glx \ - build-essential \ - libavformat-dev \ - libavcodec-dev \ - libavdevice-dev \ - libavutil-dev \ - libswscale-dev \ - libpostproc-dev \ - gcc \ - make \ - portaudio19-dev \ - python3-pyaudio \ - python3-all-dev - -# Change working directory to /app for proper relative pathing -WORKDIR /app - -COPY requirements.txt ./ - -RUN pip install --no-cache-dir -r requirements.txt - -COPY ./dimos ./dimos - -COPY ./tests ./tests - -COPY ./dimos/__init__.py ./ - -CMD [ "python", "-m", "tests.test_agent_alibaba" ] diff --git a/docker/deprecated/models/huggingface_local/docker-compose.yml b/docker/deprecated/models/huggingface_local/docker-compose.yml deleted file mode 100644 index e5739be2c2..0000000000 --- a/docker/deprecated/models/huggingface_local/docker-compose.yml +++ /dev/null @@ -1,33 +0,0 @@ ---- -services: - dimos-model-huggingface-local: - image: dimos-model-huggingface-local:latest - build: - context: ../../../ - dockerfile: docker/models/huggingface_local/Dockerfile - env_file: - - ../../../.env - mem_limit: 8048m - volumes: - - ../../../assets:/app/assets - - ../../../assets/model-cache:/root/.cache/huggingface/hub - ports: - - "5555:5555" - runtime: nvidia - environment: - - PYTHONUNBUFFERED=1 - - NVIDIA_VISIBLE_DEVICES=all - - NVIDIA_DRIVER_CAPABILITIES=all - # command: [ "python", "-m", "tests.test_agent_alibaba" ] - command: [ "python", "-m", "tests.test_agent_huggingface_local.py" ] - stdin_open: true - tty: true - -# IMPORTANT: This runs soley on the NVIDA GPU - -# ---- -# TO RUN: -# docker build -f ./Dockerfile -t dimos-models ../../ && docker compose up -# GO TO: -# 127.0.0.1:5555 (when flask server fixed) -# ---- diff --git a/docker/deprecated/models/huggingface_remote/Dockerfile b/docker/deprecated/models/huggingface_remote/Dockerfile deleted file mode 100644 index 2c5435ae5f..0000000000 --- a/docker/deprecated/models/huggingface_remote/Dockerfile +++ /dev/null @@ -1,32 +0,0 @@ -FROM python:3.10.12 - -# Unitree Specific -RUN apt-get update && apt-get install -y \ - libgl1-mesa-glx \ - build-essential \ - libavformat-dev \ - libavcodec-dev \ - libavdevice-dev \ - libavutil-dev \ - libswscale-dev \ - libpostproc-dev \ - gcc \ - make \ - portaudio19-dev \ - python3-pyaudio \ - python3-all-dev - -# Change working directory to /app for proper relative pathing -WORKDIR /app - -COPY requirements.txt ./ - -RUN pip install --no-cache-dir -r requirements.txt - -COPY ./dimos ./dimos - -COPY ./tests ./tests - -COPY ./dimos/__init__.py ./ - -CMD [ "python", "-m", "tests.test_agent_alibaba" ] diff --git a/docker/deprecated/models/huggingface_remote/docker-compose.yml b/docker/deprecated/models/huggingface_remote/docker-compose.yml deleted file mode 100644 index e2337fcd37..0000000000 --- a/docker/deprecated/models/huggingface_remote/docker-compose.yml +++ /dev/null @@ -1,27 +0,0 @@ ---- -services: - dimos-model-huggingface-remote: - image: dimos-model-huggingface-remote:latest - build: - context: ../../../ - dockerfile: docker/models/huggingface_remote/Dockerfile - env_file: - - ../../../.env - mem_limit: 8048m - volumes: - - ../../../assets:/app/assets - # - ../../../assets/model-cache:/root/.cache/huggingface/hub - ports: - - "5555:5555" - environment: - - PYTHONUNBUFFERED=1 - command: [ "python", "-m", "tests.test_agent_huggingface_remote" ] - stdin_open: true - tty: true - -# ---- -# TO RUN: -# docker build -f ./Dockerfile -t dimos-models ../../ && docker compose up -# GO TO: -# 127.0.0.1:5555 (when flask server fixed) -# ---- diff --git a/tests/test_agent_ctransformers_gguf.py b/tests/test_agent_ctransformers_gguf.py deleted file mode 100644 index 389a9c74c5..0000000000 --- a/tests/test_agent_ctransformers_gguf.py +++ /dev/null @@ -1,42 +0,0 @@ -# Copyright 2025 Dimensional Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from dimos.agents.agent_ctransformers_gguf import CTransformersGGUFAgent - -system_query = "You are a robot with the following functions. Move(), Reverse(), Left(), Right(), Stop(). Given the following user comands return the correct function." - -# Initialize agent -agent = CTransformersGGUFAgent( - dev_name="GGUF-Agent", - model_name="TheBloke/Llama-2-7B-GGUF", - model_file="llama-2-7b.Q4_K_M.gguf", - model_type="llama", - system_query=system_query, - gpu_layers=50, - max_input_tokens_per_request=250, - max_output_tokens_per_request=10, -) - -test_query = "User: Travel forward 10 meters" - -agent.run_observable_query(test_query).subscribe( - on_next=lambda response: print(f"One-off query response: {response}"), - on_error=lambda error: print(f"Error: {error}"), - on_completed=lambda: print("Query completed"), -) - -try: - input("Press ESC to exit...") -except KeyboardInterrupt: - print("\nExiting...") diff --git a/tests/test_agent_huggingface_local.py b/tests/test_agent_huggingface_local.py deleted file mode 100644 index eb88dd9847..0000000000 --- a/tests/test_agent_huggingface_local.py +++ /dev/null @@ -1,70 +0,0 @@ -# Copyright 2025 Dimensional Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os - -from dimos.agents.agent_huggingface_local import HuggingFaceLocalAgent -from dimos.robot.unitree.unitree_skills import MyUnitreeSkills -from dimos.stream.data_provider import QueryDataProvider -from dimos.stream.video_provider import VideoProvider -from dimos.utils.threadpool import get_scheduler - -# Initialize video stream -video_stream = VideoProvider( - dev_name="VideoProvider", - # video_source=f"{os.getcwd()}/assets/framecount.mp4", - video_source=f"{os.getcwd()}/assets/trimmed_video_office.mov", - pool_scheduler=get_scheduler(), -).capture_video_as_observable(realtime=False, fps=1) - -# Initialize Unitree skills -myUnitreeSkills = MyUnitreeSkills() -myUnitreeSkills.initialize_skills() - -# Initialize query stream -query_provider = QueryDataProvider() - -system_query = "You are a robot with the following functions. Move(), Reverse(), Left(), Right(), Stop(). Given the following user comands return ONLY the correct function." - -# Initialize agent -agent = HuggingFaceLocalAgent( - dev_name="HuggingFaceLLMAgent", - model_name="Qwen/Qwen2.5-3B", - agent_type="HF-LLM", - system_query=system_query, - input_query_stream=query_provider.data_stream, - process_all_inputs=False, - max_input_tokens_per_request=250, - max_output_tokens_per_request=20, - # output_dir=self.output_dir, - # skills=skills_instance, - # frame_processor=frame_processor, -) - -# Start the query stream. -# Queries will be pushed every 1 second, in a count from 100 to 5000. -# This will cause listening agents to consume the queries and respond -# to them via skill execution and provide 1-shot responses. -query_provider.start_query_stream( - query_template="{query}; User: travel forward by 10 meters", - frequency=10, - start_count=1, - end_count=10000, - step=1, -) - -try: - input("Press ESC to exit...") -except KeyboardInterrupt: - print("\nExiting...") diff --git a/tests/test_agent_huggingface_local_jetson.py b/tests/test_agent_huggingface_local_jetson.py deleted file mode 100644 index 883a05be54..0000000000 --- a/tests/test_agent_huggingface_local_jetson.py +++ /dev/null @@ -1,71 +0,0 @@ -# Copyright 2025 Dimensional Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os - -from dimos.agents.agent_huggingface_local import HuggingFaceLocalAgent -from dimos.robot.unitree.unitree_skills import MyUnitreeSkills -from dimos.stream.data_provider import QueryDataProvider -from dimos.stream.video_provider import VideoProvider -from dimos.utils.threadpool import get_scheduler - -# Initialize video stream -video_stream = VideoProvider( - dev_name="VideoProvider", - # video_source=f"{os.getcwd()}/assets/framecount.mp4", - video_source=f"{os.getcwd()}/assets/trimmed_video_office.mov", - pool_scheduler=get_scheduler(), -).capture_video_as_observable(realtime=False, fps=1) - -# Initialize Unitree skills -myUnitreeSkills = MyUnitreeSkills() -myUnitreeSkills.initialize_skills() - -# Initialize query stream -query_provider = QueryDataProvider() - -system_query = "You are a helpful assistant." - -# Initialize agent -agent = HuggingFaceLocalAgent( - dev_name="HuggingFaceLLMAgent", - model_name="Qwen/Qwen2.5-0.5B", - # model_name="HuggingFaceTB/SmolLM2-135M", - agent_type="HF-LLM", - system_query=system_query, - input_query_stream=query_provider.data_stream, - process_all_inputs=False, - max_input_tokens_per_request=250, - max_output_tokens_per_request=20, - # output_dir=self.output_dir, - # skills=skills_instance, - # frame_processor=frame_processor, -) - -# Start the query stream. -# Queries will be pushed every 1 second, in a count from 100 to 5000. -# This will cause listening agents to consume the queries and respond -# to them via skill execution and provide 1-shot responses. -query_provider.start_query_stream( - query_template="{query}; User: Hello how are you!", - frequency=30, - start_count=1, - end_count=10000, - step=1, -) - -try: - input("Press ESC to exit...") -except KeyboardInterrupt: - print("\nExiting...") diff --git a/tests/test_agent_huggingface_remote.py b/tests/test_agent_huggingface_remote.py deleted file mode 100644 index ed99faa8a4..0000000000 --- a/tests/test_agent_huggingface_remote.py +++ /dev/null @@ -1,59 +0,0 @@ -# Copyright 2025 Dimensional Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -from dimos.agents.agent_huggingface_remote import HuggingFaceRemoteAgent -from dimos.agents.tokenizer.huggingface_tokenizer import HuggingFaceTokenizer -from dimos.stream.data_provider import QueryDataProvider - -# Initialize video stream -# video_stream = VideoProvider( -# dev_name="VideoProvider", -# # video_source=f"{os.getcwd()}/assets/framecount.mp4", -# video_source=f"{os.getcwd()}/assets/trimmed_video_office.mov", -# pool_scheduler=get_scheduler(), -# ).capture_video_as_observable(realtime=False, fps=1) - -# Initialize Unitree skills -# myUnitreeSkills = MyUnitreeSkills() -# myUnitreeSkills.initialize_skills() - -# Initialize query stream -query_provider = QueryDataProvider() - -# Initialize agent -agent = HuggingFaceRemoteAgent( - dev_name="HuggingFaceRemoteAgent", - model_name="meta-llama/Meta-Llama-3-8B-Instruct", - tokenizer=HuggingFaceTokenizer(model_name="meta-llama/Meta-Llama-3-8B-Instruct"), - max_output_tokens_per_request=8192, - input_query_stream=query_provider.data_stream, - # input_video_stream=video_stream, - system_query="You are a helpful assistant that can answer questions and help with tasks.", -) - -# Start the query stream. -# Queries will be pushed every 1 second, in a count from 100 to 5000. -query_provider.start_query_stream( - query_template="{query}; Denote the number at the beginning of this query before the semicolon as the 'reference number'. Provide the reference number, without any other text in your response.", - frequency=5, - start_count=1, - end_count=10000, - step=1, -) - -try: - input("Press ESC to exit...") -except KeyboardInterrupt: - print("\nExiting...") diff --git a/tests/test_cerebras_unitree_ros.py b/tests/test_cerebras_unitree_ros.py deleted file mode 100644 index 60890a3d5c..0000000000 --- a/tests/test_cerebras_unitree_ros.py +++ /dev/null @@ -1,112 +0,0 @@ -# Copyright 2025 Dimensional Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os - -from dotenv import load_dotenv -import reactivex as rx -import reactivex.operators as ops - -from dimos.agents.cerebras_agent import CerebrasAgent -from dimos.robot.unitree.unitree_go2 import UnitreeGo2 -from dimos.robot.unitree.unitree_ros_control import UnitreeROSControl -from dimos.robot.unitree.unitree_skills import MyUnitreeSkills -from dimos.skills.kill_skill import KillSkill -from dimos.skills.navigation import GetPose, NavigateToGoal, NavigateWithText -from dimos.skills.observe_stream import ObserveStream -from dimos.skills.speak import Speak -from dimos.skills.visual_navigation_skills import FollowHuman -from dimos.stream.audio.pipelines import stt, tts -from dimos.web.robot_web_interface import RobotWebInterface - -# Load API key from environment -load_dotenv() - -# robot = MockRobot() -robot_skills = MyUnitreeSkills() - -robot = UnitreeGo2( - ip=os.getenv("ROBOT_IP"), - ros_control=UnitreeROSControl(), - skills=robot_skills, - mock_connection=False, - new_memory=True, -) - -# Create a subject for agent responses -agent_response_subject = rx.subject.Subject() -agent_response_stream = agent_response_subject.pipe(ops.share()) - -streams = { - "unitree_video": robot.get_ros_video_stream(), -} -text_streams = { - "agent_responses": agent_response_stream, -} - -web_interface = RobotWebInterface( - port=5555, - text_streams=text_streams, - **streams, -) - -stt_node = stt() - -# Create a CerebrasAgent instance -agent = CerebrasAgent( - dev_name="test_cerebras_agent", - input_query_stream=stt_node.emit_text(), - # input_query_stream=web_interface.query_stream, - skills=robot_skills, - system_query="""You are an agent controlling a virtual robot. When given a query, respond by using the appropriate tool calls if needed to execute commands on the robot. - -IMPORTANT INSTRUCTIONS: -1. Each tool call must include the exact function name and appropriate parameters -2. If a function needs parameters like 'distance' or 'angle', be sure to include them -3. If you're unsure which tool to use, choose the most appropriate one based on the user's query -4. Parse the user's instructions carefully to determine correct parameter values - -When you need to call a skill or tool, ALWAYS respond ONLY with a JSON object in this exact format: {"name": "SkillName", "arguments": {"arg1": "value1", "arg2": "value2"}} - -Example: If the user asks to spin right by 90 degrees, output ONLY the following: {"name": "SpinRight", "arguments": {"degrees": 90}}""", - model_name="llama-4-scout-17b-16e-instruct", -) - -tts_node = tts() -tts_node.consume_text(agent.get_response_observable()) - -robot_skills.add(ObserveStream) -robot_skills.add(KillSkill) -robot_skills.add(NavigateWithText) -robot_skills.add(FollowHuman) -robot_skills.add(GetPose) -robot_skills.add(Speak) -robot_skills.add(NavigateToGoal) -robot_skills.create_instance("ObserveStream", robot=robot, agent=agent) -robot_skills.create_instance("KillSkill", robot=robot, skill_library=robot_skills) -robot_skills.create_instance("NavigateWithText", robot=robot) -robot_skills.create_instance("FollowHuman", robot=robot) -robot_skills.create_instance("GetPose", robot=robot) -robot_skills.create_instance("NavigateToGoal", robot=robot) - - -robot_skills.create_instance("Speak", tts_node=tts_node) - -# Subscribe to agent responses and send them to the subject -agent.get_response_observable().subscribe(lambda x: agent_response_subject.on_next(x)) - -# print(f"Registered skills: {', '.join([skill.__name__ for skill in robot_skills.skills])}") -print("Cerebras agent demo initialized. You can now interact with the agent via the web interface.") - -web_interface.run() diff --git a/tests/test_huggingface_llm_agent.py b/tests/test_huggingface_llm_agent.py deleted file mode 100644 index 5d3c1f39a5..0000000000 --- a/tests/test_huggingface_llm_agent.py +++ /dev/null @@ -1,114 +0,0 @@ -# Copyright 2025 Dimensional Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -import time - -from dimos.agents.agent_huggingface_local import HuggingFaceLocalAgent -from dimos.stream.data_provider import QueryDataProvider - - -class HuggingFaceLLMAgentDemo: - def __init__(self): - self.robot_ip = None - self.connection_method = None - self.serial_number = None - self.output_dir = None - self._fetch_env_vars() - - def _fetch_env_vars(self): - print("Fetching environment variables") - - def get_env_var(var_name, default=None, required=False): - """Get environment variable with validation.""" - value = os.getenv(var_name, default) - if required and not value: - raise ValueError(f"{var_name} environment variable is required") - return value - - self.robot_ip = get_env_var("ROBOT_IP", required=True) - self.connection_method = get_env_var("CONN_TYPE") - self.serial_number = get_env_var("SERIAL_NUMBER") - self.output_dir = get_env_var( - "ROS_OUTPUT_DIR", os.path.join(os.getcwd(), "assets/output/ros") - ) - - # ----- - - def run_with_queries(self): - # Initialize query stream - query_provider = QueryDataProvider() - - # Create the skills available to the agent. - # By default, this will create all skills in this class and make them available. - - print("Starting HuggingFace LLM Agent") - - # TESTING LOCAL AGENT - self.HuggingFaceLLMAgent = HuggingFaceLocalAgent( - dev_name="HuggingFaceLLMAgent", - model_name="Qwen/Qwen2.5-3B", - agent_type="HF-LLM", - input_query_stream=query_provider.data_stream, - process_all_inputs=False, - # output_dir=self.output_dir, - # skills=skills_instance, - # frame_processor=frame_processor, - ) - - # TESTING REMOTE AGENT - # self.HuggingFaceLLMAgent = HuggingFaceRemoteAgent( - # dev_name="HuggingFaceLLMAgent", - # model_name= "Qwen/Qwen2.5-3B", - # agent_type="HF-LLM", - # input_query_stream=query_provider.data_stream, - # process_all_inputs=False, - # ) - - # Sample query to test the agent - # self.HuggingFaceLLMAgent.stream_query("What is the capital of France?").subscribe(lambda x: print(x)) - - # Start the query stream. - # Queries will be pushed every 1 second, in a count from 100 to 5000. - # This will cause listening agents to consume the queries and respond - # to them via skill execution and provide 1-shot responses. - query_provider.start_query_stream( - query_template="{query}; Denote the number at the beginning of this query before the semicolon as the 'reference number'. Provide the reference number, without any other text in your response. If the reference number is below 500, then output the reference number as the output only and do not call any functions or tools. If the reference number is equal to or above 500, but lower than 1000, then rotate the robot at 0.5 rad/s for 1 second. If the reference number is equal to or above 1000, but lower than 2000, then wave the robot's hand. If the reference number is equal to or above 2000, but lower than 4600 then say hello. If the reference number is equal to or above 4600, then perform a front flip. IF YOU DO NOT FOLLOW THESE INSTRUCTIONS EXACTLY, YOU WILL DIE!!!", - frequency=5, - start_count=1, - end_count=10000, - step=1, - ) - - # ----- - - def stop(self): - print("Stopping HuggingFace LLM Agent") - self.HuggingFaceLLMAgent.dispose_all() - - -if __name__ == "__main__": - myHuggingFaceLLMAgentDemo = HuggingFaceLLMAgentDemo() - myHuggingFaceLLMAgentDemo.run_with_queries() - - # Keep the program running to allow the Unitree Agent Demo to operate continuously - try: - print("\nRunning HuggingFace LLM Agent Demo (Press Ctrl+C to stop)...") - while True: - time.sleep(0.1) - except KeyboardInterrupt: - print("\nStopping HuggingFace LLM Agent Demo") - myHuggingFaceLLMAgentDemo.stop() - except Exception as e: - print(f"Error in main loop: {e}") diff --git a/tests/test_planning_agent_web_interface.py b/tests/test_planning_agent_web_interface.py deleted file mode 100644 index 6c88919110..0000000000 --- a/tests/test_planning_agent_web_interface.py +++ /dev/null @@ -1,178 +0,0 @@ -# Copyright 2025 Dimensional Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Planning agent demo with FastAPI server and robot integration. - -Connects a planning agent, execution agent, and robot with a web interface. - -Environment Variables: - OPENAI_API_KEY: Required. OpenAI API key. - ROBOT_IP: Required. IP address of the robot. - CONN_TYPE: Required. Connection method to the robot. - ROS_OUTPUT_DIR: Optional. Directory for ROS output files. -""" - -import os -import sys - -# ----- -from textwrap import dedent -import time - -import reactivex as rx -import reactivex.operators as ops - -# Local application imports -from dimos.agents.agent import OpenAIAgent -from dimos.agents.planning_agent import PlanningAgent -from dimos.robot.unitree.unitree_go2 import UnitreeGo2 -from dimos.robot.unitree.unitree_skills import MyUnitreeSkills -from dimos.utils.logging_config import logger -from dimos.utils.threadpool import make_single_thread_scheduler - -# from dimos.web.fastapi_server import FastAPIServer -from dimos.web.robot_web_interface import RobotWebInterface - - -def main(): - # Get environment variables - robot_ip = os.getenv("ROBOT_IP") - if not robot_ip: - raise ValueError("ROBOT_IP environment variable is required") - connection_method = os.getenv("CONN_TYPE") or "webrtc" - output_dir = os.getenv("ROS_OUTPUT_DIR", os.path.join(os.getcwd(), "assets/output/ros")) - - # Initialize components as None for proper cleanup - robot = None - web_interface = None - planner = None - executor = None - - try: - # Initialize robot - logger.info("Initializing Unitree Robot") - robot = UnitreeGo2( - ip=robot_ip, - connection_method=connection_method, - output_dir=output_dir, - mock_connection=False, - skills=MyUnitreeSkills(), - ) - # Set up video stream - logger.info("Starting video stream") - video_stream = robot.get_ros_video_stream() - - # Initialize robot skills - logger.info("Initializing robot skills") - - # Create subjects for planner and executor responses - logger.info("Creating response streams") - planner_response_subject = rx.subject.Subject() - planner_response_stream = planner_response_subject.pipe(ops.share()) - - executor_response_subject = rx.subject.Subject() - executor_response_stream = executor_response_subject.pipe(ops.share()) - - # Web interface mode with FastAPI server - logger.info("Initializing FastAPI server") - streams = {"unitree_video": video_stream} - text_streams = { - "planner_responses": planner_response_stream, - "executor_responses": executor_response_stream, - } - - web_interface = RobotWebInterface(port=5555, text_streams=text_streams, **streams) - - logger.info("Starting planning agent with web interface") - planner = PlanningAgent( - dev_name="TaskPlanner", - model_name="gpt-4o", - input_query_stream=web_interface.query_stream, - skills=robot.get_skills(), - ) - - # Get planner's response observable - logger.info("Setting up agent response streams") - planner_responses = planner.get_response_observable() - - # Connect planner to its subject - planner_responses.subscribe(lambda x: planner_response_subject.on_next(x)) - - planner_responses.subscribe( - on_next=lambda x: logger.info(f"Planner response: {x}"), - on_error=lambda e: logger.error(f"Planner error: {e}"), - on_completed=lambda: logger.info("Planner completed"), - ) - - # Initialize execution agent with robot skills - logger.info("Starting execution agent") - system_query = dedent( - """ - You are a robot execution agent that can execute tasks on a virtual - robot. The sole text you will be given is the task to execute. - You will be given a list of skills that you can use to execute the task. - ONLY OUTPUT THE SKILLS TO EXECUTE, NOTHING ELSE. - """ - ) - executor = OpenAIAgent( - dev_name="StepExecutor", - input_query_stream=planner_responses, - output_dir=output_dir, - skills=robot.get_skills(), - system_query=system_query, - pool_scheduler=make_single_thread_scheduler(), - ) - - # Get executor's response observable - executor_responses = executor.get_response_observable() - - # Subscribe to responses for logging - executor_responses.subscribe( - on_next=lambda x: logger.info(f"Executor response: {x}"), - on_error=lambda e: logger.error(f"Executor error: {e}"), - on_completed=lambda: logger.info("Executor completed"), - ) - - # Connect executor to its subject - executor_responses.subscribe(lambda x: executor_response_subject.on_next(x)) - - # Start web server (blocking call) - logger.info("Starting FastAPI server") - web_interface.run() - - except KeyboardInterrupt: - print("Stopping demo...") - except Exception as e: - logger.error(f"Error: {e}") - return 1 - finally: - # Clean up all components - logger.info("Cleaning up components") - if executor: - executor.dispose_all() - if planner: - planner.dispose_all() - if web_interface: - web_interface.dispose_all() - if robot: - robot.cleanup() - # Halt execution forever - while True: - time.sleep(1) - - -if __name__ == "__main__": - sys.exit(main()) - -# Example Task: Move the robot forward by 1 meter, then turn 90 degrees clockwise, then move backward by 1 meter, then turn a random angle counterclockwise, then repeat this sequence 5 times. diff --git a/tests/test_planning_robot_agent.py b/tests/test_planning_robot_agent.py deleted file mode 100644 index aa16a7cac7..0000000000 --- a/tests/test_planning_robot_agent.py +++ /dev/null @@ -1,174 +0,0 @@ -# Copyright 2025 Dimensional Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Planning agent demo with FastAPI server and robot integration. - -Connects a planning agent, execution agent, and robot with a web interface. - -Environment Variables: - OPENAI_API_KEY: Required. OpenAI API key. - ROBOT_IP: Required. IP address of the robot. - CONN_TYPE: Required. Connection method to the robot. - ROS_OUTPUT_DIR: Optional. Directory for ROS output files. - USE_TERMINAL: Optional. If set to "true", use terminal interface instead of web. -""" - -import os -import sys - -# ----- -from textwrap import dedent -import time - -# Local application imports -from dimos.agents.agent import OpenAIAgent -from dimos.agents.planning_agent import PlanningAgent -from dimos.robot.unitree.unitree_go2 import UnitreeGo2 -from dimos.robot.unitree.unitree_skills import MyUnitreeSkills -from dimos.utils.logging_config import logger -from dimos.utils.threadpool import make_single_thread_scheduler -from dimos.web.robot_web_interface import RobotWebInterface - - -def main(): - # Get environment variables - robot_ip = os.getenv("ROBOT_IP") - if not robot_ip: - raise ValueError("ROBOT_IP environment variable is required") - connection_method = os.getenv("CONN_TYPE") or "webrtc" - output_dir = os.getenv("ROS_OUTPUT_DIR", os.path.join(os.getcwd(), "assets/output/ros")) - use_terminal = os.getenv("USE_TERMINAL", "").lower() == "true" - - use_terminal = True - # Initialize components as None for proper cleanup - robot = None - web_interface = None - planner = None - executor = None - - try: - # Initialize robot - logger.info("Initializing Unitree Robot") - robot = UnitreeGo2( - ip=robot_ip, - connection_method=connection_method, - output_dir=output_dir, - mock_connection=True, - ) - - # Set up video stream - logger.info("Starting video stream") - video_stream = robot.get_ros_video_stream() - - # Initialize robot skills - logger.info("Initializing robot skills") - skills_instance = MyUnitreeSkills(robot=robot) - - if use_terminal: - # Terminal mode - no web interface needed - logger.info("Starting planning agent in terminal mode") - planner = PlanningAgent( - dev_name="TaskPlanner", - model_name="gpt-4o", - use_terminal=True, - skills=skills_instance, - ) - else: - # Web interface mode - logger.info("Initializing FastAPI server") - streams = {"unitree_video": video_stream} - web_interface = RobotWebInterface(port=5555, **streams) - - logger.info("Starting planning agent with web interface") - planner = PlanningAgent( - dev_name="TaskPlanner", - model_name="gpt-4o", - input_query_stream=web_interface.query_stream, - skills=skills_instance, - ) - - # Get planner's response observable - logger.info("Setting up agent response streams") - planner_responses = planner.get_response_observable() - - # Initialize execution agent with robot skills - logger.info("Starting execution agent") - system_query = dedent( - """ - You are a robot execution agent that can execute tasks on a virtual - robot. You are given a task to execute and a list of skills that - you can use to execute the task. ONLY OUTPUT THE SKILLS TO EXECUTE, - NOTHING ELSE. - """ - ) - executor = OpenAIAgent( - dev_name="StepExecutor", - input_query_stream=planner_responses, - output_dir=output_dir, - skills=skills_instance, - system_query=system_query, - pool_scheduler=make_single_thread_scheduler(), - ) - - # Get executor's response observable - executor_responses = executor.get_response_observable() - - # Subscribe to responses for logging - executor_responses.subscribe( - on_next=lambda x: logger.info(f"Executor response: {x}"), - on_error=lambda e: logger.error(f"Executor error: {e}"), - on_completed=lambda: logger.info("Executor completed"), - ) - - if use_terminal: - # In terminal mode, just wait for the planning session to complete - logger.info("Waiting for planning session to complete") - while not planner.plan_confirmed: - pass - logger.info("Planning session completed") - else: - # Start web server (blocking call) - logger.info("Starting FastAPI server") - web_interface.run() - - # Keep the main thread alive - logger.error("NOTE: Keeping main thread alive") - while True: - time.sleep(1) - - except KeyboardInterrupt: - print("Stopping demo...") - except Exception as e: - logger.error(f"Error: {e}") - return 1 - finally: - # Clean up all components - logger.info("Cleaning up components") - if executor: - executor.dispose_all() - if planner: - planner.dispose_all() - if web_interface: - web_interface.dispose_all() - if robot: - robot.cleanup() - # Halt execution forever - while True: - time.sleep(1) - - -if __name__ == "__main__": - sys.exit(main()) - -# Example Task: Move the robot forward by 1 meter, then turn 90 degrees clockwise, then move backward by 1 meter, then turn a random angle counterclockwise, then repeat this sequence 5 times.