diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index b61d463d593..7d9f645814d 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -276,5 +276,19 @@ target_link_libraries(test-frameforge-new-features PRIVATE common)
 add_test(NAME test-frameforge-new-features COMMAND $<TARGET_FILE:test-frameforge-new-features> WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}/tests)
 set_property(TEST test-frameforge-new-features PROPERTY LABELS "main")
+# FrameForge Audio Capture Test
+add_executable(test-frameforge-audio test-frameforge-audio.cpp
+    ${CMAKE_SOURCE_DIR}/tools/frameforge/frameforge-audio.cpp)
+target_include_directories(test-frameforge-audio PRIVATE ${CMAKE_SOURCE_DIR}/tools/frameforge ${CMAKE_SOURCE_DIR}/vendor)
+target_link_libraries(test-frameforge-audio PRIVATE common)
+# Add PortAudio if available
+if(PORTAUDIO_FOUND)
+    target_include_directories(test-frameforge-audio PRIVATE ${PORTAUDIO_INCLUDE_DIRS})
+    target_link_libraries(test-frameforge-audio PRIVATE ${PORTAUDIO_LIBRARIES})
+    target_compile_definitions(test-frameforge-audio PRIVATE FRAMEFORGE_PORTAUDIO_SUPPORT)
+endif()
+add_test(NAME test-frameforge-audio COMMAND $<TARGET_FILE:test-frameforge-audio> WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}/tests)
+set_property(TEST test-frameforge-audio PROPERTY LABELS "main")
+
 llama_build_and_test(test-alloc.cpp)
 target_include_directories(test-alloc PRIVATE ${PROJECT_SOURCE_DIR}/ggml/src)
diff --git a/tests/test-frameforge-audio.cpp b/tests/test-frameforge-audio.cpp
new file mode 100644
index 00000000000..78c95945397
--- /dev/null
+++ b/tests/test-frameforge-audio.cpp
@@ -0,0 +1,127 @@
+#include "../tools/frameforge/frameforge-audio.h"
+
+#include <cassert>
+#include <chrono>
+#include <iostream>
+#include <thread>
+
+using namespace frameforge;
+
+static void test_audio_config() {
+    std::cout << "Testing audio configuration..."
<< std::endl;
+
+    AudioConfig config;
+    assert(config.sample_rate == 16000);
+    assert(config.channels == 1);
+    assert(config.frames_per_buffer == 512);
+
+    // Custom config
+    AudioConfig custom;
+    custom.sample_rate = 44100;
+    custom.channels = 2;
+    custom.frames_per_buffer = 1024;
+
+    assert(custom.sample_rate == 44100);
+    assert(custom.channels == 2);
+    assert(custom.frames_per_buffer == 1024);
+
+    std::cout << "  ✓ Audio configuration passed" << std::endl;
+}
+
+static void test_audio_capture_initialization() {
+    std::cout << "Testing audio capture initialization..." << std::endl;
+
+    AudioConfig config;
+    AudioCapture capture(config);
+
+    // Just test that we can create an instance
+    assert(!capture.is_capturing());
+
+#ifdef FRAMEFORGE_PORTAUDIO_SUPPORT
+    std::cout << "  PortAudio support is available" << std::endl;
+
+    // Try to initialize
+    bool init_result = capture.initialize();
+    if (init_result) {
+        std::cout << "  ✓ Audio capture initialization succeeded" << std::endl;
+
+        // Follow typical usage pattern: start capturing before testing buffer operations.
+        capture.start();
+
+        // Test buffer operations
+        capture.clear_buffer();
+        std::vector<float> buffer = capture.get_audio_buffer();
+        assert(buffer.empty());
+        std::cout << "  ✓ Buffer operations work" << std::endl;
+
+        // Stop capturing to complete the typical lifecycle.
+        capture.stop();
+    } else {
+        std::cout << "  ! Audio capture initialization failed (this is OK if no audio device is available)" << std::endl;
+    }
+#else
+    std::cout << "  PortAudio support is not available" << std::endl;
+    std::cout << "  ✓ Stub implementation works" << std::endl;
+#endif
+}
+
+static void test_audio_callback() {
+    std::cout << "Testing audio callback..."
<< std::endl;
+
+#ifdef FRAMEFORGE_PORTAUDIO_SUPPORT
+    AudioConfig config;
+    AudioCapture capture(config);
+
+    bool callback_called = false;
+    capture.set_callback([&callback_called](const std::vector<float> & data) {
+        callback_called = true;
+        std::cout << "  Callback received " << data.size() << " samples" << std::endl;
+    });
+
+    if (capture.initialize()) {
+        if (capture.start()) {
+            std::cout << "  Audio capture started, waiting for callback..." << std::endl;
+
+            // Wait for a short time to see if we get audio data
+            auto start_time = std::chrono::steady_clock::now();
+            while (!callback_called) {
+                std::this_thread::sleep_for(std::chrono::milliseconds(100));
+                auto elapsed = std::chrono::duration_cast<std::chrono::seconds>(
+                    std::chrono::steady_clock::now() - start_time
+                ).count();
+
+                if (elapsed > 2) {
+                    std::cout << "  ! No callback received after 2 seconds (no audio input?)" << std::endl;
+                    break;
+                }
+            }
+
+            capture.stop();
+
+            if (callback_called) {
+                std::cout << "  ✓ Audio callback test passed" << std::endl;
+            } else {
+                std::cout << "  ! Audio callback test completed (no audio detected)" << std::endl;
+            }
+        } else {
+            std::cout << "  ! Could not start audio capture" << std::endl;
+        }
+    } else {
+        std::cout << "  ! Could not initialize audio capture" << std::endl;
+    }
+#else
+    std::cout << "  PortAudio support not available, skipping callback test" << std::endl;
+#endif
+}
+
+int main() {
+    std::cout << "Running FrameForge Audio Capture Tests" << std::endl;
+    std::cout << "======================================" << std::endl;
+
+    test_audio_config();
+    test_audio_capture_initialization();
+    test_audio_callback();
+
+    std::cout << "\nAll tests completed!"
<< std::endl; + return 0; +} diff --git a/tools/frameforge/CMakeLists.txt b/tools/frameforge/CMakeLists.txt index c75aa487fa7..88e23072a95 100644 --- a/tools/frameforge/CMakeLists.txt +++ b/tools/frameforge/CMakeLists.txt @@ -6,6 +6,7 @@ add_executable(${TARGET} frameforge-validator.cpp frameforge-json.cpp frameforge-ipc.cpp + frameforge-audio.cpp ) target_include_directories(${TARGET} PRIVATE @@ -32,6 +33,22 @@ else() message(WARNING "Whisper not found at ${CMAKE_SOURCE_DIR}/external/whisper, frameforge-sidecar will build without Whisper support") endif() +# PortAudio for live audio capture +find_package(PkgConfig) +if(PkgConfig_FOUND) + pkg_check_modules(PORTAUDIO portaudio-2.0) + if(PORTAUDIO_FOUND) + target_include_directories(${TARGET} PRIVATE ${PORTAUDIO_INCLUDE_DIRS}) + target_link_libraries(${TARGET} PRIVATE ${PORTAUDIO_LIBRARIES}) + target_compile_definitions(${TARGET} PRIVATE FRAMEFORGE_PORTAUDIO_SUPPORT) + message(STATUS "PortAudio found, enabling live audio capture") + else() + message(WARNING "PortAudio not found, frameforge-sidecar will build without live audio capture support") + endif() +else() + message(WARNING "PkgConfig not found, cannot detect PortAudio") +endif() + # Platform-specific libraries if(WIN32) # Windows-specific libraries diff --git a/tools/frameforge/README.md b/tools/frameforge/README.md index 45654198cad..1fe25a8cf15 100644 --- a/tools/frameforge/README.md +++ b/tools/frameforge/README.md @@ -5,7 +5,7 @@ This tool integrates Whisper.cpp for speech-to-text and Llama.cpp for intent cla ## Overview The FrameForge Sidecar is a 64-bit resident process that: -1. Receives audio input (via file or IPC) +1. Receives audio input (via file, IPC, or live microphone capture with PortAudio) 2. Transcribes audio to text using Whisper 3. Classifies intent and extracts parameters using Llama 4. 
Validates commands against a strict schema @@ -116,6 +116,30 @@ cmake --build build --config Release The binary will be located at: `build/bin/frameforge-sidecar` +### Dependencies + +**Required:** +- Llama.cpp (built-in) + +**Optional:** +- Whisper.cpp - For speech-to-text transcription (recommended) +- PortAudio - For live microphone audio capture (recommended for production use) + +To enable PortAudio support, install the development library before building: + +**Ubuntu/Debian:** +```bash +sudo apt-get install portaudio19-dev +``` + +**macOS:** +```bash +brew install portaudio +``` + +**Windows:** +Download and install PortAudio from http://www.portaudio.com/ + ## Usage ### Test Mode (with audio file) @@ -128,6 +152,24 @@ The binary will be located at: `build/bin/frameforge-sidecar` --verbose ``` +### Live Audio Capture Mode (with microphone) + +**Requires both PortAudio and Whisper support (see Building section)** + +```bash +./build/bin/frameforge-sidecar \ + --whisper-model /path/to/whisper-model.bin \ + --llama-model /path/to/llama-model.gguf \ + --live-audio \ + --verbose +``` + +This mode continuously captures audio from the default microphone using Voice Activity Detection (VAD). It automatically detects when speech begins and ends, then processes the audio when: +1. At least 500ms of speech is detected +2. Followed by 250ms of silence + +The captured speech is then transcribed with Whisper, classified with Llama, and validated. The audio buffer is cleared after each processing cycle. 
+
 ### Server Mode (IPC with Named Pipes)
 
 ```bash
 ./build/bin/frameforge-sidecar \
@@ -139,9 +181,10 @@ The binary will be located at: `build/bin/frameforge-sidecar`
 
 ### Command-Line Options
 
-- `-wm, --whisper-model FNAME` - Path to Whisper model file (required)
+- `-wm, --whisper-model FNAME` - Path to Whisper model file (required if Whisper support is compiled)
 - `-lm, --llama-model FNAME` - Path to Llama model file (required)
 - `-a, --audio FILE` - Audio file to transcribe (for testing)
+- `-la, --live-audio` - Enable live audio capture via PortAudio (requires PortAudio support)
 - `-p, --pipe NAME` - Named pipe name (default: frameforge_pipe)
 - `-vd, --verb-defs FILE` - Path to verb definitions JSON file (optional)
 - `-t, --threads N` - Number of threads (default: 4)
diff --git a/tools/frameforge/frameforge-audio.cpp b/tools/frameforge/frameforge-audio.cpp
new file mode 100644
index 00000000000..8929fc5a616
--- /dev/null
+++ b/tools/frameforge/frameforge-audio.cpp
@@ -0,0 +1,305 @@
+#include "frameforge-audio.h"
+
+#ifdef FRAMEFORGE_PORTAUDIO_SUPPORT
+#include <portaudio.h>
+#endif
+
+#include <cmath>
+#include <iostream>
+#include <mutex>
+#include <vector>
+
+namespace frameforge {
+
+#ifdef FRAMEFORGE_PORTAUDIO_SUPPORT
+
+// Global PortAudio initialization tracking (protected by mutex)
+static std::mutex g_portaudio_mutex;
+static bool g_portaudio_initialized = false;
+static int g_portaudio_ref_count = 0;
+
+AudioCapture::AudioCapture(const AudioConfig & config)
+    : config_(config)
+    , callback_(nullptr)
+    , capturing_(false)
+    , stream_(nullptr)
+    , initialized_(false)
+    , ready_to_process_(false)
+    , has_speech_(false)
+    , speech_sample_count_(0)
+    , silence_sample_count_(0) {
+    // Calculate sample thresholds
+    min_speech_samples_ = static_cast<size_t>(
+        (config_.min_speech_duration_ms / 1000.0f) * config_.sample_rate
+    );
+    silence_samples_threshold_ = static_cast<size_t>(
+        (config_.silence_duration_ms / 1000.0f) * config_.sample_rate
+    );
+    max_buffer_samples_ = static_cast<size_t>(
+        config_.max_buffer_duration_s * config_.sample_rate * config_.channels
);
+}
+
+AudioCapture::~AudioCapture() {
+    stop();
+    if (stream_) {
+        Pa_CloseStream(static_cast<PaStream *>(stream_));
+        stream_ = nullptr;
+    }
+
+    // Decrement reference count and terminate PortAudio if needed
+    if (initialized_) {
+        std::lock_guard<std::mutex> lock(g_portaudio_mutex);
+        g_portaudio_ref_count--;
+        if (g_portaudio_ref_count == 0) {
+            Pa_Terminate();
+            g_portaudio_initialized = false;
+        }
+    }
+}
+
+bool AudioCapture::initialize() {
+    // Check if already initialized
+    if (initialized_) {
+        std::cerr << "Warning: AudioCapture already initialized" << std::endl;
+        return true;
+    }
+
+    // Initialize PortAudio if not already initialized (thread-safe)
+    {
+        std::lock_guard<std::mutex> lock(g_portaudio_mutex);
+        if (!g_portaudio_initialized) {
+            PaError err = Pa_Initialize();
+            if (err != paNoError) {
+                std::cerr << "PortAudio error: " << Pa_GetErrorText(err) << std::endl;
+                return false;
+            }
+            g_portaudio_initialized = true;
+        }
+        g_portaudio_ref_count++;
+    }
+    initialized_ = true;
+
+    // Get default input device
+    PaDeviceIndex device = Pa_GetDefaultInputDevice();
+    if (device == paNoDevice) {
+        std::cerr << "Error: No default input device found" << std::endl;
+        // Decrement ref count on error
+        std::lock_guard<std::mutex> lock(g_portaudio_mutex);
+        g_portaudio_ref_count--;
+        initialized_ = false;
+        return false;
+    }
+
+    // Print device info
+    const PaDeviceInfo * device_info = Pa_GetDeviceInfo(device);
+    if (device_info) {
+        std::cout << "Using audio device: " << device_info->name << std::endl;
+        std::cout << "  Sample rate: " << config_.sample_rate << " Hz" << std::endl;
+        std::cout << "  Channels: " << config_.channels << std::endl;
+    } else {
+        std::cerr << "Error: Could not get device info" << std::endl;
+        // Decrement ref count on error
+        std::lock_guard<std::mutex> lock(g_portaudio_mutex);
+        g_portaudio_ref_count--;
+        initialized_ = false;
+        return false;
+    }
+
+    // Set up stream parameters
+    PaStreamParameters input_params;
+    input_params.device = device;
+    input_params.channelCount = config_.channels;
input_params.sampleFormat = paFloat32;
+    input_params.suggestedLatency = device_info->defaultLowInputLatency;
+    input_params.hostApiSpecificStreamInfo = nullptr;
+
+    // Open audio stream
+    PaStream * pa_stream = nullptr;
+    PaError err = Pa_OpenStream(
+        &pa_stream, &input_params,
+        nullptr, // no output
+        config_.sample_rate, config_.frames_per_buffer, paClipOff,
+        pa_callback, this);
+
+    if (err != paNoError) {
+        std::cerr << "PortAudio error: " << Pa_GetErrorText(err) << std::endl;
+        // Decrement ref count on error
+        std::lock_guard<std::mutex> lock(g_portaudio_mutex);
+        g_portaudio_ref_count--;
+        initialized_ = false;
+        return false;
+    }
+
+    stream_ = pa_stream;
+    return true;
+}
+
+bool AudioCapture::start() {
+    if (!stream_) {
+        std::cerr << "Error: Audio stream not initialized" << std::endl;
+        return false;
+    }
+
+    if (capturing_) {
+        return true; // Already capturing
+    }
+
+    PaError err = Pa_StartStream(static_cast<PaStream *>(stream_));
+    if (err != paNoError) {
+        std::cerr << "PortAudio error: " << Pa_GetErrorText(err) << std::endl;
+        return false;
+    }
+
+    capturing_ = true;
+    std::cout << "Audio capture started" << std::endl;
+    return true;
+}
+
+void AudioCapture::stop() {
+    if (!capturing_) {
+        return;
+    }
+
+    if (stream_) {
+        PaError err = Pa_StopStream(static_cast<PaStream *>(stream_));
+        if (err != paNoError) {
+            std::cerr << "PortAudio error when stopping stream: " << Pa_GetErrorText(err) << std::endl;
+            return;
+        }
+    }
+
+    capturing_ = false;
+    std::cout << "Audio capture stopped" << std::endl;
+}
+
+void AudioCapture::set_callback(AudioCallback callback) {
+    callback_ = callback;
+}
+
+std::vector<float> AudioCapture::get_audio_buffer() {
+    std::lock_guard<std::mutex> lock(buffer_mutex_);
+    std::vector<float> buffer_copy;
+    buffer_copy.swap(audio_buffer_);
+    return buffer_copy;
+}
+
+void AudioCapture::clear_buffer() {
+    std::lock_guard<std::mutex> lock(buffer_mutex_);
+    audio_buffer_.clear();
+}
+
+void AudioCapture::reset_vad_state() {
+    std::lock_guard<std::mutex> lock(vad_mutex_);
+    ready_to_process_ = false;
+    has_speech_ =
false;
+    speech_sample_count_ = 0;
+    silence_sample_count_ = 0;
+}
+
+float AudioCapture::calculate_rms(const float * data, size_t sample_count) const {
+    if (!data || sample_count == 0) {
+        return 0.0f;
+    }
+
+    float sum_squares = 0.0f;
+    for (size_t i = 0; i < sample_count; ++i) {
+        sum_squares += data[i] * data[i];
+    }
+
+    return std::sqrt(sum_squares / sample_count);
+}
+
+bool AudioCapture::is_speech(const float * data, size_t sample_count) const {
+    float rms = calculate_rms(data, sample_count);
+    return rms > config_.vad_threshold;
+}
+
+int AudioCapture::pa_callback(const void * input, void * output, unsigned long frame_count,
+                              const PaStreamCallbackTimeInfo * time_info,
+                              PaStreamCallbackFlags status_flags, void * user_data) {
+    (void) output;
+    (void) time_info;
+    (void) status_flags;
+
+    AudioCapture * capture = static_cast<AudioCapture *>(user_data);
+    const float * in = static_cast<const float *>(input);
+
+    if (in && capture) {
+        capture->handle_audio_data(in, frame_count);
+    }
+
+    return paContinue;
+}
+
+void AudioCapture::handle_audio_data(const float * data, unsigned long frame_count) {
+    if (!data || frame_count == 0) {
+        return;
+    }
+
+    // Calculate total samples (frames * channels)
+    size_t total_samples = frame_count * config_.channels;
+
+    // Store in buffer with size check to prevent unbounded growth
+    {
+        std::lock_guard<std::mutex> lock(buffer_mutex_);
+
+        // Check if adding this data would exceed maximum buffer size
+        if (audio_buffer_.size() + total_samples > max_buffer_samples_) {
+            // Buffer is too large - remove oldest samples to make room
+            // This implements a rolling window approach
+            size_t samples_to_remove = (audio_buffer_.size() + total_samples) - max_buffer_samples_;
+            audio_buffer_.erase(audio_buffer_.begin(), audio_buffer_.begin() + samples_to_remove);
+        }
+
+        audio_buffer_.insert(audio_buffer_.end(), data, data + total_samples);
+    }
+
+    // Perform voice activity detection
+    bool current_is_speech = is_speech(data, total_samples);
+
+    // Update VAD state atomically
+    {
+        std::lock_guard<std::mutex> lock(vad_mutex_);
+
+        if (current_is_speech) {
+            // We have speech
+            speech_sample_count_ += total_samples;
+            silence_sample_count_ = 0; // Reset silence counter
+
+            // Mark that we've detected speech
+            if (speech_sample_count_ >= min_speech_samples_) {
+                has_speech_ = true;
+            }
+        } else {
+            // We have silence
+            if (has_speech_) {
+                // We had speech before, now counting silence
+                silence_sample_count_ += total_samples;
+
+                // Check if we've had enough silence to trigger processing
+                if (silence_sample_count_ >= silence_samples_threshold_) {
+                    ready_to_process_ = true;
+                }
+            }
+            // If we don't have speech yet, keep resetting counters
+            else {
+                speech_sample_count_ = 0;
+                silence_sample_count_ = 0;
+            }
+        }
+    }
+
+    // Call user callback if set
+    // Note: Creating vector from raw data involves a copy, but this matches
+    // the callback signature. For high-performance use cases, consider using
+    // the internal buffer directly or redesigning the callback interface.
+    if (callback_) {
+        const std::vector<float> callback_data(data, data + total_samples);
+        callback_(callback_data);
+    }
+}
+
+#endif // FRAMEFORGE_PORTAUDIO_SUPPORT
+
+} // namespace frameforge
diff --git a/tools/frameforge/frameforge-audio.h b/tools/frameforge/frameforge-audio.h
new file mode 100644
index 00000000000..555f00ed401
--- /dev/null
+++ b/tools/frameforge/frameforge-audio.h
@@ -0,0 +1,128 @@
+#ifndef FRAMEFORGE_AUDIO_H
+#define FRAMEFORGE_AUDIO_H
+
+#include <atomic>
+#include <functional>
+#include <mutex>
+#include <vector>
+
+// Forward declarations for PortAudio types
+#ifdef FRAMEFORGE_PORTAUDIO_SUPPORT
+struct PaStreamCallbackTimeInfo;
+typedef unsigned long PaStreamCallbackFlags;
+#endif
+
+namespace frameforge {
+
+// Audio capture configuration
+struct AudioConfig {
+    int sample_rate = 16000;       // Sample rate in Hz (16kHz is standard for Whisper)
+    int channels = 1;              // Number of channels (1 = mono)
+    int frames_per_buffer = 512;   // Number of frames per buffer
+    float vad_threshold = 0.01f;   // Voice activity
detection threshold (RMS)
+    float min_speech_duration_ms = 500.0f;   // Minimum speech duration in milliseconds
+    float silence_duration_ms = 250.0f;      // Silence duration to trigger processing
+    float max_buffer_duration_s = 30.0f;     // Maximum buffer duration in seconds (prevents unbounded growth)
+};
+
+// Audio capture callback function type
+// Called when audio data is available
+// Parameters: PCM float data, number of samples
+using AudioCallback = std::function<void(const std::vector<float> &)>;
+
+#ifdef FRAMEFORGE_PORTAUDIO_SUPPORT
+
+// Audio capture class using PortAudio
+class AudioCapture {
+public:
+    AudioCapture(const AudioConfig & config = AudioConfig());
+    ~AudioCapture();
+
+    // Initialize the audio capture system
+    bool initialize();
+
+    // Start capturing audio
+    bool start();
+
+    // Stop capturing audio
+    void stop();
+
+    // Check if currently capturing
+    bool is_capturing() const { return capturing_; }
+
+    // Set callback for audio data
+    void set_callback(AudioCallback callback);
+
+    // Get captured audio buffer (for accumulated audio)
+    std::vector<float> get_audio_buffer();
+
+    // Clear the audio buffer
+    void clear_buffer();
+
+    // Check if ready to process (speech detected + silence after)
+    bool is_ready_to_process() const {
+        std::lock_guard<std::mutex> lock(const_cast<std::mutex &>(vad_mutex_));
+        return ready_to_process_;
+    }
+
+    // Reset VAD state
+    void reset_vad_state();
+
+private:
+    AudioConfig config_;
+    AudioCallback callback_;
+    std::atomic<bool> capturing_;
+    std::vector<float> audio_buffer_;
+    std::mutex buffer_mutex_;
+    void * stream_;      // PaStream* (opaque pointer to avoid including portaudio.h here)
+    bool initialized_;   // Track if this instance has initialized PortAudio
+
+    // VAD state tracking (protected by vad_mutex_)
+    std::mutex vad_mutex_;
+    bool ready_to_process_;
+    bool has_speech_;
+    size_t speech_sample_count_;
+    size_t silence_sample_count_;
+    size_t min_speech_samples_;
+    size_t silence_samples_threshold_;
+    size_t max_buffer_samples_;  // Maximum buffer size in samples
+
+    // PortAudio
callback (static function)
+    static int pa_callback(const void * input, void * output, unsigned long frame_count,
+                           const PaStreamCallbackTimeInfo * time_info,
+                           PaStreamCallbackFlags status_flags, void * user_data);
+
+    // Instance callback handler
+    void handle_audio_data(const float * data, unsigned long frame_count);
+
+    // Calculate RMS (root mean square) of audio data
+    float calculate_rms(const float * data, size_t sample_count) const;
+
+    // Check if audio chunk contains speech
+    bool is_speech(const float * data, size_t sample_count) const;
+};
+
+#else
+
+// Stub implementation when PortAudio is not available
+class AudioCapture {
+public:
+    AudioCapture(const AudioConfig & config = AudioConfig()) { (void) config; }
+    ~AudioCapture() {}
+
+    bool initialize() { return false; }
+    bool start() { return false; }
+    void stop() {}
+    bool is_capturing() const { return false; }
+    void set_callback(AudioCallback callback) { (void) callback; }
+    std::vector<float> get_audio_buffer() { return std::vector<float>(); }
+    void clear_buffer() {}
+    bool is_ready_to_process() const { return false; }
+    void reset_vad_state() {}
+};
+
+#endif
+
+} // namespace frameforge
+
+#endif // FRAMEFORGE_AUDIO_H
diff --git a/tools/frameforge/frameforge-sidecar.cpp b/tools/frameforge/frameforge-sidecar.cpp
index b9620890ca1..35c286ca627 100644
--- a/tools/frameforge/frameforge-sidecar.cpp
+++ b/tools/frameforge/frameforge-sidecar.cpp
@@ -2,6 +2,7 @@
 #ifdef FRAMEFORGE_WHISPER_SUPPORT
 #include "../../external/whisper/include/whisper.h"
 #endif
+#include "frameforge-audio.h"
 #include "frameforge-ipc.h"
 #include "frameforge-json.h"
 #include "frameforge-schema.h"
@@ -10,6 +11,7 @@
 #include
 #include
+#include <csignal>
 #include
 #include
 #include
@@ -18,6 +20,16 @@
 #include
 #include
+// Global flag for signal handling
+static std::atomic<bool> g_running(true);
+
+// Signal handler for graceful shutdown
+static void signal_handler(int signal) {
+    if (signal == SIGINT || signal == SIGTERM) {
+        g_running = false;
+    }
+}
+
 //
System prompt for Llama intent classification static const char * INTENT_SYSTEM_PROMPT = R"(You are an intent classifier for FrameForge Studio, a professional previsualization software. @@ -78,6 +90,7 @@ struct frameforge_params { std::string verb_definitions_file; // Path to verb definitions JSON int n_threads = 4; bool verbose = false; + bool live_audio = false; // Enable live audio capture with PortAudio }; static void print_usage(const char * argv0) { @@ -88,6 +101,9 @@ static void print_usage(const char * argv0) { #endif fprintf(stderr, " -lm, --llama-model FNAME Path to Llama model file\n"); fprintf(stderr, " -a, --audio FILE Audio file to transcribe (for testing)\n"); +#ifdef FRAMEFORGE_PORTAUDIO_SUPPORT + fprintf(stderr, " -la, --live-audio Enable live audio capture via PortAudio\n"); +#endif fprintf(stderr, " -p, --pipe NAME Named pipe name (default: frameforge_pipe)\n"); fprintf(stderr, " -vd, --verb-defs FILE Path to verb definitions JSON file\n"); fprintf(stderr, " -t, --threads N Number of threads (default: 4)\n"); @@ -123,6 +139,10 @@ static bool parse_params(int argc, char ** argv, frameforge_params & params) { fprintf(stderr, "Error: Missing value for %s\n", arg.c_str()); return false; } +#ifdef FRAMEFORGE_PORTAUDIO_SUPPORT + } else if (arg == "-la" || arg == "--live-audio") { + params.live_audio = true; +#endif } else if (arg == "-p" || arg == "--pipe") { if (i + 1 < argc) { params.pipe_name = argv[++i]; @@ -332,6 +352,10 @@ int main(int argc, char ** argv) { return 1; } + // Set up signal handlers for graceful shutdown (once at the start) + std::signal(SIGINT, signal_handler); + std::signal(SIGTERM, signal_handler); + // Load verb definitions if provided if (!params.verb_definitions_file.empty()) { fprintf(stderr, "Loading verb definitions from: %s\n", params.verb_definitions_file.c_str()); @@ -437,6 +461,105 @@ int main(int argc, char ** argv) { return 0; } + // Live audio capture mode: capture audio from microphone +#if 
defined(FRAMEFORGE_PORTAUDIO_SUPPORT) && defined(FRAMEFORGE_WHISPER_SUPPORT)
+    if (params.live_audio) {
+        fprintf(stderr, "Starting live audio capture mode...\n");
+
+        frameforge::AudioConfig audio_config;
+        audio_config.sample_rate = 16000;      // 16kHz for Whisper
+        audio_config.channels = 1;             // Mono
+        audio_config.frames_per_buffer = 512;
+
+        frameforge::AudioCapture audio_capture(audio_config);
+
+        if (!audio_capture.initialize()) {
+            fprintf(stderr, "Error: Failed to initialize audio capture\n");
+            llama_free(lctx);
+            llama_model_free(model);
+            whisper_free(wctx);
+            return 1;
+        }
+
+        if (!audio_capture.start()) {
+            fprintf(stderr, "Error: Failed to start audio capture\n");
+            llama_free(lctx);
+            llama_model_free(model);
+            whisper_free(wctx);
+            return 1;
+        }
+
+        fprintf(stderr, "FrameForge Sidecar ready. Listening to microphone...\n");
+        fprintf(stderr, "Press Ctrl+C to stop\n");
+        fprintf(stderr, "VAD settings: min_speech=%.0fms, silence=%.0fms\n",
+                audio_config.min_speech_duration_ms, audio_config.silence_duration_ms);
+
+        // Main loop for live audio with VAD
+        while (g_running) {
+            std::this_thread::sleep_for(std::chrono::milliseconds(50));
+
+            // Check if VAD has detected speech followed by silence
+            if (audio_capture.is_ready_to_process()) {
+                // Get accumulated audio buffer
+                std::vector<float> audio_buffer = audio_capture.get_audio_buffer();
+
+                fprintf(stderr, "\nProcessing %.2f seconds of audio...\n",
+                        static_cast<float>(audio_buffer.size()) / audio_config.sample_rate);
+
+                // Transcribe audio
+                fprintf(stderr, "Transcribing audio...\n");
+                std::string transcription = transcribe_audio(wctx, audio_buffer, params.verbose);
+
+                if (!transcription.empty()) {
+                    fprintf(stderr, "Transcription: %s\n", transcription.c_str());
+
+                    // Classify intent
+                    fprintf(stderr, "Classifying intent...\n");
+                    std::string llm_response = classify_intent(lctx, model, transcription, params.verbose);
+                    fprintf(stderr, "LLM Response: %s\n", llm_response.c_str());
+
+                    // Validate the command
frameforge::Command cmd;
+                    frameforge::ValidationResult result = validator.validate_json(llm_response, cmd);
+
+                    if (result.valid) {
+                        std::string json_output = frameforge::command_to_json(cmd);
+                        fprintf(stderr, "Valid command:\n%s\n", json_output.c_str());
+                    } else {
+                        fprintf(stderr, "Validation failed: %s\n", result.error_message.c_str());
+                        std::string clarification = validator.generate_clarification_request(result, cmd);
+                        fprintf(stderr, "Clarification: %s\n", clarification.c_str());
+                    }
+                } else {
+                    fprintf(stderr, "No transcription generated (silence or noise)\n");
+                }
+
+                // Clear the buffer and reset VAD state after processing
+                audio_capture.clear_buffer();
+                audio_capture.reset_vad_state();
+
+                fprintf(stderr, "\nListening...\n");
+            }
+        }
+
+        audio_capture.stop();
+        llama_free(lctx);
+        llama_model_free(model);
+        whisper_free(wctx);
+        return 0;
+    }
+#else
+    if (params.live_audio) {
+        fprintf(stderr, "Error: Live audio capture requires both PortAudio and Whisper support\n");
+        llama_free(lctx);
+        llama_model_free(model);
+#ifdef FRAMEFORGE_WHISPER_SUPPORT
+        whisper_free(wctx);
+#endif
+        return 1;
+    }
+#endif
+
 // Server mode: start IPC server
 fprintf(stderr, "Starting IPC server on pipe: %s\n", params.pipe_name.c_str());
 frameforge::IPCServer ipc_server(params.pipe_name);
@@ -452,10 +575,10 @@
     }
 
     fprintf(stderr, "FrameForge Sidecar ready. Waiting for commands...\n");
+    fprintf(stderr, "Press Ctrl+C to stop\n");
 
     // Main loop
-    std::atomic<bool> running(true);
-    while (running) {
+    while (g_running) {
         // In a real implementation, this would:
         // 1. Receive audio data from the IPC pipe
         // 2. Transcribe with Whisper