diff --git a/MULTIMODAL_UPDATES.md b/MULTIMODAL_UPDATES.md new file mode 100644 index 0000000..8dfa90f --- /dev/null +++ b/MULTIMODAL_UPDATES.md @@ -0,0 +1,202 @@ +# Multimodal Driver Support - Node SDK Updates + +This update adds comprehensive multimodal support to the Node SDK to match the new agi-driver capabilities. + +## Changes Made + +### Protocol Updates (`src/driver/protocol.ts`) + +#### New Event Types +- `AudioTranscriptEvent`: Audio transcript from buffer +- `VideoFrameEvent`: Video frame from camera/screen +- `SpeechStartedEvent`: TTS playback started +- `SpeechFinishedEvent`: TTS playback finished +- `TurnDetectedEvent`: Voice turn detection + +#### New Command Types +- `GetAudioTranscriptCommand`: Request audio transcript +- `GetVideoFrameCommand`: Request video frame + +#### New Interfaces +- `MCPServerConfig`: MCP server configuration +- `AgentIdentity`: Agent identity information +- `ToolChoice`: Tool choice configuration type + +#### Updated StartCommand +Added fields for multimodal configuration: +- `agent_identity?: AgentIdentity` - Agent identity (default: agi-2-claude by AGI Company) +- `tool_choice?: ToolChoice` - Tool choice mode +- `mcp_servers?: MCPServerConfig[]` - MCP server configurations +- `audio_input_enabled?: boolean`, `audio_buffer_seconds?: number` +- `turn_detection_enabled?: boolean`, `turn_detection_silence_ms?: number` +- `speech_output_enabled?: boolean`, `speech_voice?: string` +- `camera_enabled?: boolean`, `camera_buffer_seconds?: number` +- `screen_recording_enabled?: boolean`, `screen_recording_buffer_seconds?: number` + +### Exports (`src/driver/index.ts`) +Added exports for all new event and command types, plus helper interfaces. 
+ +## Usage Examples + +### Basic Multimodal Session + +```typescript +import { AgentDriver } from '@agi_inc/agi-js'; + +const driver = new AgentDriver({ + mode: 'local', + agentName: 'agi-2-claude' +}); + +// Start with multimodal features +await driver.start({ + goal: 'Help me with my computer', + mode: 'local', + agent_name: 'agi-2-claude', + + // Voice features + audio_input_enabled: true, + turn_detection_enabled: true, + speech_output_enabled: true, + speech_voice: 'alloy', + + // Video features + camera_enabled: true, + screen_recording_enabled: true, + + // MCP servers + mcp_servers: [ + { + name: 'filesystem', + command: 'npx', + args: ['-y', '@modelcontextprotocol/server-filesystem', '/path/to/dir'], + env: {} + } + ], + + // Tool choice + tool_choice: 'auto' +}); +``` + +### Handling New Events + +```typescript +driver.on('audio_transcript', (event: AudioTranscriptEvent) => { + console.log(`Transcript: ${event.transcript}`); +}); + +driver.on('video_frame', (event: VideoFrameEvent) => { + // event.frame_base64 contains JPEG frame + saveFrame(event.frame_base64); +}); + +driver.on('speech_started', (event: SpeechStartedEvent) => { + console.log(`🔊 Speaking: ${event.text}`); +}); + +driver.on('speech_finished', () => { + console.log('✓ Finished speaking'); +}); + +driver.on('turn_detected', (event: TurnDetectedEvent) => { + console.log(`You said: ${event.transcript}`); +}); +``` + +### Voice-Only Mode + +```typescript +await driver.start({ + goal: '(voice input)', + mode: 'local', + audio_input_enabled: true, + turn_detection_enabled: true, + turn_detection_silence_ms: 1000, // 1 second of silence = turn complete + speech_output_enabled: true, + speech_voice: 'alloy' // or: echo, fable, onyx, nova, shimmer +}); +``` + +### MCP Servers + +```typescript +const mcpServers: MCPServerConfig[] = [ + { + name: 'filesystem', + command: 'npx', + args: ['-y', '@modelcontextprotocol/server-filesystem', '/Users/you/Documents'] + }, + { + name: 'database',
command: 'python', + args: ['-m', 'my_db_server'], + env: { DATABASE_URL: 'postgresql://...' } + } +]; + +await driver.start({ + goal: 'Analyze my documents', + mode: 'local', + mcp_servers: mcpServers +}); +``` + +### Tool Choice Configuration + +```typescript +// Auto (default) +tool_choice: 'auto' + +// Required - must use at least one tool +tool_choice: 'required' + +// None - no tool use +tool_choice: 'none' + +// Specific tool +tool_choice: { type: 'tool', name: 'filesystem__read_file' } +``` + +## Breaking Changes + +⚠️ This is a breaking change with no backwards compatibility. + +- `StartCommand.mode` is now required, and `mode` (on `StartCommand`, `DriverOptions`, and `start()`) is narrowed from `string` to `'' | 'local' | 'remote'`; the new multimodal `StartCommand` fields are all optional +- New event types may be emitted (`audio_transcript`, `video_frame`, `speech_started`, `speech_finished`, `turn_detected`) +- `agent_name` should be set to `"agi-2-claude"` for new agents + +## Testing + +```bash +# Install updated SDK +npm install + +# Build TypeScript +npm run build + +# Run tests +npm test + +# Try a voice session +node -e " +const { AgentDriver } = require('./dist'); + +(async () => { + const driver = new AgentDriver({ mode: 'local' }); + const result = await driver.start({ + goal: 'Test voice', + mode: 'local', + audio_input_enabled: true, + speech_output_enabled: true + }); + console.log(result); +})(); +" +``` + +## Related PRs + +- agi-api (driver): https://github.com/agi-inc/agents/pull/344 +- agi-python: https://github.com/agi-inc/agi-python/pull/8 +- agi-csharp: TBD diff --git a/package-lock.json b/package-lock.json index 23ef233..b7a4e5d 100644 --- a/package-lock.json +++ b/package-lock.json @@ -33,18 +33,6 @@ "@agi/agi-win32-x64": "0.4.0" } }, - "node_modules/@agi/agi-darwin-arm64": { - "optional": true - }, - "node_modules/@agi/agi-darwin-x64": { - "optional": true - }, - "node_modules/@agi/agi-linux-x64": { - "optional": true - }, - "node_modules/@agi/agi-win32-x64": { - "optional": true - }, "node_modules/@ampproject/remapping": { "version": "2.3.0", "resolved": "https://registry.npmjs.org/@ampproject/remapping/-/remapping-2.3.0.tgz", diff --git a/package.json b/package.json index
f02584c..2c3d59a 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "@agi_inc/agi-js", - "version": "0.4.2", + "version": "0.5.0", "description": "Official TypeScript/JavaScript SDK for AGI.tech API", "main": "./dist/index.js", "module": "./dist/index.mjs", diff --git a/src/driver/driver.ts b/src/driver/driver.ts index 37b3ac6..950cb0a 100644 --- a/src/driver/driver.ts +++ b/src/driver/driver.ts @@ -8,6 +8,9 @@ import { spawn, ChildProcess } from 'child_process'; import { EventEmitter } from 'events'; import { createInterface, Interface } from 'readline'; +import { readFileSync, existsSync } from 'fs'; +import { resolve } from 'path'; +import { homedir } from 'os'; import { findBinaryPath } from './binary'; import { DriverEvent, @@ -34,7 +37,7 @@ export interface DriverOptions { /** Platform type (default: 'desktop') */ platform?: 'desktop' | 'android'; /** "local" for autonomous mode, "remote" for managed VM, "" for legacy SDK-driven mode */ - mode?: string; + mode?: '' | 'local' | 'remote'; /** Agent name for the AGI API (e.g., "agi-2-claude") */ agentName?: string; /** AGI API base URL (default: "https://api.agi.tech") */ @@ -43,6 +46,18 @@ export interface DriverOptions { environmentType?: string; /** Environment variables to pass to the driver process */ env?: Record; + + // Multimodal options + /** Enable voice input/output */ + voice?: boolean; + /** Enable camera video feed */ + camera?: boolean; + /** Enable screen recording */ + screen?: boolean; + /** Enable MCP servers */ + mcp?: boolean; + /** Path to MCP config file */ + mcpConfig?: string; } /** @@ -90,12 +105,19 @@ export class AgentDriver extends EventEmitter { private readonly binaryPath: string; private readonly model: string; private readonly platform: 'desktop' | 'android'; - private readonly mode: string; + private readonly mode: '' | 'local' | 'remote'; private readonly agentName: string; private readonly apiUrl: string; private readonly environmentType: string; private 
readonly env: Record; + // Multimodal options + private readonly voice: boolean; + private readonly camera: boolean; + private readonly screen: boolean; + private readonly mcp: boolean; + private readonly mcpConfig: string; + private process: ChildProcess | null = null; private readline: Interface | null = null; private state: DriverState = 'idle'; @@ -111,6 +133,38 @@ export class AgentDriver extends EventEmitter { private pendingConfirm: ((approved: boolean, message?: string) => void) | null = null; private pendingAnswer: ((text: string) => void) | null = null; + /** + * Load MCP server configuration from file. + * @param configPath - Path to MCP config file (supports ~ expansion) + * @returns Array of MCP server configs, or undefined if file doesn't exist + */ + private loadMcpConfig(configPath: string): any[] | undefined { + try { + // Expand ~ to home directory + const expandedPath = configPath.startsWith('~') + ? resolve(homedir(), configPath.slice(2)) + : resolve(configPath); + + if (!existsSync(expandedPath)) { + return undefined; + } + + const content = readFileSync(expandedPath, 'utf-8'); + const config = JSON.parse(content); + + // Convert config object to array of MCPServerConfig + return Object.entries(config).map(([name, serverConfig]: [string, any]) => ({ + name, + command: serverConfig.command, + args: serverConfig.args || [], + env: serverConfig.env || {}, + })); + } catch (error) { + // If config loading fails, return undefined (MCP will be disabled) + return undefined; + } + } + constructor(options: DriverOptions = {}) { super(); @@ -123,6 +177,13 @@ export class AgentDriver extends EventEmitter { this.apiUrl = options.apiUrl ?? ''; this.environmentType = options.environmentType ?? ''; this.env = options.env ?? {}; + + // Multimodal options + this.voice = options.voice ?? false; + this.camera = options.camera ?? false; + this.screen = options.screen ?? false; + this.mcp = options.mcp ?? false; + this.mcpConfig = options.mcpConfig ?? 
'~/.agi/mcp.json'; } /** @@ -168,7 +229,7 @@ export class AgentDriver extends EventEmitter { screenshot: string = '', screenWidth: number = 0, screenHeight: number = 0, - mode?: string + mode?: '' | 'local' | 'remote' ): Promise { if (this.process) { throw new Error('Driver is already running'); @@ -248,6 +309,15 @@ export class AgentDriver extends EventEmitter { agent_name: this.agentName || undefined, api_url: this.apiUrl || undefined, environment_type: this.environmentType || undefined, + + // Multimodal options + audio_input_enabled: this.voice, + turn_detection_enabled: this.voice, + speech_output_enabled: this.voice, + speech_voice: this.voice ? 'alloy' : undefined, + camera_enabled: this.camera, + screen_recording_enabled: this.screen, + mcp_servers: this.mcp ? this.loadMcpConfig(this.mcpConfig) : undefined, }; this.sendCommand(startCmd); }); @@ -464,6 +534,26 @@ export class AgentDriver extends EventEmitter { this.emit('session_created', event); break; + case 'audio_transcript': + this.emit('audio_transcript', event); + break; + + case 'video_frame': + this.emit('video_frame', event); + break; + + case 'speech_started': + this.emit('speech_started', event); + break; + + case 'speech_finished': + this.emit('speech_finished', event); + break; + + case 'turn_detected': + this.emit('turn_detected', event); + break; + case 'finished': this.handleFinished(event); break; diff --git a/src/driver/index.ts b/src/driver/index.ts index c51431f..4909820 100644 --- a/src/driver/index.ts +++ b/src/driver/index.ts @@ -21,6 +21,11 @@ export { type ErrorEvent, type ScreenshotCapturedEvent, type SessionCreatedEvent, + type AudioTranscriptEvent, + type VideoFrameEvent, + type SpeechStartedEvent, + type SpeechFinishedEvent, + type TurnDetectedEvent, type StartCommand, type ScreenshotCommand, type PauseCommand, @@ -28,6 +33,11 @@ export { type StopCommand, type ConfirmResponseCommand, type AnswerCommand, + type GetAudioTranscriptCommand, + type GetVideoFrameCommand, + type 
MCPServerConfig, + type AgentIdentity, + type ToolChoice, } from './protocol'; export { findBinaryPath, isBinaryAvailable, getPlatformId, type PlatformId } from './binary'; diff --git a/src/driver/protocol.ts b/src/driver/protocol.ts index c55c20e..c3feb98 100644 --- a/src/driver/protocol.ts +++ b/src/driver/protocol.ts @@ -17,7 +17,12 @@ export type EventType = | 'finished' | 'error' | 'screenshot_captured' - | 'session_created'; + | 'session_created' + | 'audio_transcript' + | 'video_frame' + | 'speech_started' + | 'speech_finished' + | 'turn_detected'; // Command types export type CommandType = @@ -27,7 +32,9 @@ export type CommandType = | 'resume' | 'stop' | 'confirm' - | 'answer'; + | 'answer' + | 'get_audio_transcript' + | 'get_video_frame'; // Driver states export type DriverState = @@ -114,6 +121,49 @@ export interface SessionCreatedEvent extends BaseEvent { vnc_url?: string; } +/** + * Emitted when audio transcript is available. + */ +export interface AudioTranscriptEvent extends BaseEvent { + event: 'audio_transcript'; + transcript: string; + seconds_ago: number; + duration: number; +} + +/** + * Emitted when video frame is available. + */ +export interface VideoFrameEvent extends BaseEvent { + event: 'video_frame'; + frame_base64: string; + source: 'camera' | 'screen'; + seconds_ago: number; +} + +/** + * Emitted when TTS speech starts playing. + */ +export interface SpeechStartedEvent extends BaseEvent { + event: 'speech_started'; + text: string; +} + +/** + * Emitted when TTS speech finishes playing. + */ +export interface SpeechFinishedEvent extends BaseEvent { + event: 'speech_finished'; +} + +/** + * Emitted when turn detection detects user has stopped speaking. 
+ */ +export interface TurnDetectedEvent extends BaseEvent { + event: 'turn_detected'; + transcript: string; +} + export type DriverEvent = | ReadyEvent | StateChangeEvent @@ -124,7 +174,12 @@ export type DriverEvent = | FinishedEvent | ErrorEvent | ScreenshotCapturedEvent - | SessionCreatedEvent; + | SessionCreatedEvent + | AudioTranscriptEvent + | VideoFrameEvent + | SpeechStartedEvent + | SpeechFinishedEvent + | TurnDetectedEvent; // Action type from the driver export interface DriverAction { @@ -134,6 +189,24 @@ export interface DriverAction { [key: string]: unknown; } +// MCP server configuration +export interface MCPServerConfig { + name: string; + command: string; + args: string[]; + env?: Record; +} + +// Agent identity +export interface AgentIdentity { + name: string; + creator: string; + creator_url: string; +} + +// Tool choice configuration +export type ToolChoice = 'auto' | 'required' | 'none' | { type: 'tool'; name: string }; + // Base command interface export interface BaseCommand { command: CommandType; @@ -148,14 +221,25 @@ export interface StartCommand extends BaseCommand { screen_height: number; platform: 'desktop' | 'android'; model: string; - /** "local" for autonomous mode, "remote" for managed VM, "" for legacy SDK-driven mode */ - mode?: string; - /** Agent name for the AGI API (e.g., "agi-2-claude") */ + mode: '' | 'local' | 'remote'; agent_name?: string; - /** AGI API base URL (default: "https://api.agi.tech") */ api_url?: string; - /** Environment type for remote mode ("ubuntu-1" or "chrome-1") */ environment_type?: string; + + // Multimodal features + agent_identity?: AgentIdentity; + tool_choice?: ToolChoice; + mcp_servers?: MCPServerConfig[]; + audio_input_enabled?: boolean; + audio_buffer_seconds?: number; + turn_detection_enabled?: boolean; + turn_detection_silence_ms?: number; + speech_output_enabled?: boolean; + speech_voice?: 'alloy' | 'echo' | 'fable' | 'onyx' | 'nova' | 'shimmer'; + camera_enabled?: boolean; + 
camera_buffer_seconds?: number; + screen_recording_enabled?: boolean; + screen_recording_buffer_seconds?: number; } export interface ScreenshotCommand extends BaseCommand { @@ -190,6 +274,18 @@ export interface AnswerCommand extends BaseCommand { question_id?: string; } +export interface GetAudioTranscriptCommand extends BaseCommand { + command: 'get_audio_transcript'; + seconds_ago: number; + duration: number; +} + +export interface GetVideoFrameCommand extends BaseCommand { + command: 'get_video_frame'; + source: 'camera' | 'screen'; + seconds_ago: number; +} + export type DriverCommand = | StartCommand | ScreenshotCommand @@ -197,7 +293,9 @@ export type DriverCommand = | ResumeCommand | StopCommand | ConfirmResponseCommand - | AnswerCommand; + | AnswerCommand + | GetAudioTranscriptCommand + | GetVideoFrameCommand; /** * Parse a JSON line into a DriverEvent.