From fa42a514c7610fd3c22337b3ba9e4759b5601969 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Thu, 1 Jan 2026 23:06:44 +0000 Subject: [PATCH 01/16] Initial plan From a14859433553c7a7ad3f6444c4558c67dfefb03d Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Thu, 1 Jan 2026 23:13:55 +0000 Subject: [PATCH 02/16] Add PortAudio support for live audio capture in FrameForge SideCar Co-authored-by: TheOriginalBytePlayer <18058224+TheOriginalBytePlayer@users.noreply.github.com> --- tools/frameforge/CMakeLists.txt | 17 +++ tools/frameforge/frameforge-audio.cpp | 163 ++++++++++++++++++++++++ tools/frameforge/frameforge-audio.h | 89 +++++++++++++ tools/frameforge/frameforge-sidecar.cpp | 122 ++++++++++++++++++ 4 files changed, 391 insertions(+) create mode 100644 tools/frameforge/frameforge-audio.cpp create mode 100644 tools/frameforge/frameforge-audio.h diff --git a/tools/frameforge/CMakeLists.txt b/tools/frameforge/CMakeLists.txt index c75aa487fa7..88e23072a95 100644 --- a/tools/frameforge/CMakeLists.txt +++ b/tools/frameforge/CMakeLists.txt @@ -6,6 +6,7 @@ add_executable(${TARGET} frameforge-validator.cpp frameforge-json.cpp frameforge-ipc.cpp + frameforge-audio.cpp ) target_include_directories(${TARGET} PRIVATE @@ -32,6 +33,22 @@ else() message(WARNING "Whisper not found at ${CMAKE_SOURCE_DIR}/external/whisper, frameforge-sidecar will build without Whisper support") endif() +# PortAudio for live audio capture +find_package(PkgConfig) +if(PkgConfig_FOUND) + pkg_check_modules(PORTAUDIO portaudio-2.0) + if(PORTAUDIO_FOUND) + target_include_directories(${TARGET} PRIVATE ${PORTAUDIO_INCLUDE_DIRS}) + target_link_libraries(${TARGET} PRIVATE ${PORTAUDIO_LIBRARIES}) + target_compile_definitions(${TARGET} PRIVATE FRAMEFORGE_PORTAUDIO_SUPPORT) + message(STATUS "PortAudio found, enabling live audio capture") + else() + message(WARNING "PortAudio not found, frameforge-sidecar will build without live audio capture support") + endif() +else() + message(WARNING "PkgConfig not found, cannot detect PortAudio") +endif() + # Platform-specific libraries if(WIN32) # Windows-specific libraries diff --git a/tools/frameforge/frameforge-audio.cpp b/tools/frameforge/frameforge-audio.cpp new file mode 100644 index 00000000000..db70f4bdc07 --- /dev/null +++ b/tools/frameforge/frameforge-audio.cpp @@ -0,0 +1,163 @@ +#include "frameforge-audio.h" + +#ifdef FRAMEFORGE_PORTAUDIO_SUPPORT +#include +#endif + +#include +#include + +namespace frameforge { + +#ifdef FRAMEFORGE_PORTAUDIO_SUPPORT + +AudioCapture::AudioCapture(const AudioConfig & config) + : config_(config) + , callback_(nullptr) + , capturing_(false) + , stream_(nullptr) { +} + +AudioCapture::~AudioCapture() { + stop(); + if (stream_) { + Pa_CloseStream(static_cast(stream_)); + stream_ = nullptr; + } + Pa_Terminate(); +} + +bool AudioCapture::initialize() { + PaError err = Pa_Initialize(); + if (err != paNoError) { + std::cerr << "PortAudio error: " << Pa_GetErrorText(err) << std::endl; + return false; + } + + // Get default input device + PaDeviceIndex device = Pa_GetDefaultInputDevice(); + if (device == paNoDevice) { + std::cerr << "Error: No default input device found" << std::endl; + Pa_Terminate(); + return false; + } + + // Print device info + const PaDeviceInfo * device_info = Pa_GetDeviceInfo(device); + if (device_info) { + std::cout << "Using audio device: " << device_info->name << std::endl; + std::cout << " Sample rate: " << config_.sample_rate << " Hz" << std::endl; + std::cout << " Channels: " << config_.channels << std::endl; + } + + // Set up stream parameters + PaStreamParameters input_params; + input_params.device = device; + input_params.channelCount = config_.channels; + input_params.sampleFormat = paFloat32; + input_params.suggestedLatency = device_info->defaultLowInputLatency; + input_params.hostApiSpecificStreamInfo = nullptr; + + // Open audio stream + err = Pa_OpenStream( + reinterpret_cast(&stream_), &input_params, + nullptr, // no output + config_.sample_rate, config_.frames_per_buffer, paClipOff, pa_callback, this); + + if (err != paNoError) { + std::cerr << "PortAudio error: " << Pa_GetErrorText(err) << std::endl; + Pa_Terminate(); + return false; + } + + return true; +} + +bool AudioCapture::start() { + if (!stream_) { + std::cerr << "Error: Audio stream not initialized" << std::endl; + return false; + } + + if (capturing_) { + return true; // Already capturing + } + + PaError err = Pa_StartStream(static_cast(stream_)); + if (err != paNoError) { + std::cerr << "PortAudio error: " << Pa_GetErrorText(err) << std::endl; + return false; + } + + capturing_ = true; + std::cout << "Audio capture started" << std::endl; + return true; +} + +void AudioCapture::stop() { + if (!capturing_) { + return; + } + + if (stream_) { + Pa_StopStream(static_cast(stream_)); + } + + capturing_ = false; + std::cout << "Audio capture stopped" << std::endl; +} + +void AudioCapture::set_callback(AudioCallback callback) { + callback_ = callback; +} + +std::vector AudioCapture::get_audio_buffer() { + std::lock_guard lock(buffer_mutex_); + return audio_buffer_; +} + +void AudioCapture::clear_buffer() { + std::lock_guard lock(buffer_mutex_); + audio_buffer_.clear(); +} + +int AudioCapture::pa_callback(const void * input, void * output, unsigned long frame_count, + const void * time_info, unsigned long status_flags, void * user_data) { + (void) output; + (void) time_info; + (void) status_flags; + + AudioCapture * capture = static_cast(user_data); + const float * in = static_cast(input); + + if (in && capture) { + capture->handle_audio_data(in, frame_count); + } + + return paContinue; +} + +void AudioCapture::handle_audio_data(const float * data, unsigned long frame_count) { + if (!data || frame_count == 0) { + return; + } + + // Calculate total samples (frames * channels) + size_t total_samples = frame_count * config_.channels; + + // Store in buffer + { + std::lock_guard lock(buffer_mutex_); + audio_buffer_.insert(audio_buffer_.end(), data, data + total_samples); + } + + // Call user callback if set + if (callback_) { + std::vector callback_data(data, data + total_samples); + callback_(callback_data); + } +} + +#endif // FRAMEFORGE_PORTAUDIO_SUPPORT + +} // namespace frameforge diff --git a/tools/frameforge/frameforge-audio.h b/tools/frameforge/frameforge-audio.h new file mode 100644 index 00000000000..a49acd3a47a --- /dev/null +++ b/tools/frameforge/frameforge-audio.h @@ -0,0 +1,89 @@ +#ifndef FRAMEFORGE_AUDIO_H +#define FRAMEFORGE_AUDIO_H + +#include +#include +#include +#include + +namespace frameforge { + +// Audio capture configuration +struct AudioConfig { + int sample_rate = 16000; // Sample rate in Hz (16kHz is standard for Whisper) + int channels = 1; // Number of channels (1 = mono) + int frames_per_buffer = 512; // Number of frames per buffer +}; + +// Audio capture callback function type +// Called when audio data is available +// Parameters: PCM float data, number of samples +using AudioCallback = std::function &)>; + +#ifdef FRAMEFORGE_PORTAUDIO_SUPPORT + +// Audio capture class using PortAudio +class AudioCapture { +public: + AudioCapture(const AudioConfig & config = AudioConfig()); + ~AudioCapture(); + + // Initialize the audio capture system + bool initialize(); + + // Start capturing audio + bool start(); + + // Stop capturing audio + void stop(); + + // Check if currently capturing + bool is_capturing() const { return capturing_; } + + // Set callback for audio data + void set_callback(AudioCallback callback); + + // Get captured audio buffer (for accumulated audio) + std::vector get_audio_buffer(); + + // Clear the audio buffer + void clear_buffer(); + +private: + AudioConfig config_; + AudioCallback callback_; + std::atomic capturing_; + std::vector audio_buffer_; + std::mutex buffer_mutex_; + void * stream_; // PaStream* (opaque pointer to avoid including portaudio.h here) + + // PortAudio callback (static function) + static int pa_callback(const void * input, void * output, unsigned long frame_count, + const void * time_info, unsigned long status_flags, void * user_data); + + // Instance callback handler + void handle_audio_data(const float * data, unsigned long frame_count); +}; + +#else + +// Stub implementation when PortAudio is not available +class AudioCapture { +public: + AudioCapture(const AudioConfig & config = AudioConfig()) { (void) config; } + ~AudioCapture() {} + + bool initialize() { return false; } + bool start() { return false; } + void stop() {} + bool is_capturing() const { return false; } + void set_callback(AudioCallback callback) { (void) callback; } + std::vector get_audio_buffer() { return std::vector(); } + void clear_buffer() {} +}; + +#endif + +} // namespace frameforge + +#endif // FRAMEFORGE_AUDIO_H diff --git a/tools/frameforge/frameforge-sidecar.cpp b/tools/frameforge/frameforge-sidecar.cpp index b9620890ca1..9f88c6ab5f7 100644 --- a/tools/frameforge/frameforge-sidecar.cpp +++ b/tools/frameforge/frameforge-sidecar.cpp @@ -2,6 +2,7 @@ #ifdef FRAMEFORGE_WHISPER_SUPPORT #include "../../external/whisper/include/whisper.h" #endif +#include "frameforge-audio.h" #include "frameforge-ipc.h" #include "frameforge-json.h" #include "frameforge-schema.h" @@ -78,6 +79,7 @@ struct frameforge_params { std::string verb_definitions_file; // Path to verb definitions JSON int n_threads = 4; bool verbose = false; + bool live_audio = false; // Enable live audio capture with PortAudio }; static void print_usage(const char * argv0) { @@ -88,6 +90,9 @@ static void print_usage(const char * argv0) { #endif fprintf(stderr, " -lm, --llama-model FNAME Path to Llama model file\n"); fprintf(stderr, " -a, --audio FILE Audio file to transcribe (for testing)\n"); +#ifdef FRAMEFORGE_PORTAUDIO_SUPPORT + fprintf(stderr, " -la, --live-audio Enable live audio capture via PortAudio\n"); +#endif fprintf(stderr, " -p, --pipe NAME Named pipe name (default: frameforge_pipe)\n"); fprintf(stderr, " -vd, --verb-defs FILE Path to verb definitions JSON file\n"); fprintf(stderr, " -t, --threads N Number of threads (default: 4)\n"); @@ -123,6 +128,10 @@ static bool parse_params(int argc, char ** argv, frameforge_params & params) { fprintf(stderr, "Error: Missing value for %s\n", arg.c_str()); return false; } +#ifdef FRAMEFORGE_PORTAUDIO_SUPPORT + } else if (arg == "-la" || arg == "--live-audio") { + params.live_audio = true; +#endif } else if (arg == "-p" || arg == "--pipe") { if (i + 1 < argc) { params.pipe_name = argv[++i]; @@ -437,6 +446,119 @@ int main(int argc, char ** argv) { return 0; } + // Live audio capture mode: capture audio from microphone +#ifdef FRAMEFORGE_PORTAUDIO_SUPPORT + if (params.live_audio) { + fprintf(stderr, "Starting live audio capture mode...\n"); + + frameforge::AudioConfig audio_config; + audio_config.sample_rate = 16000; // 16kHz for Whisper + audio_config.channels = 1; // Mono + audio_config.frames_per_buffer = 512; + + frameforge::AudioCapture audio_capture(audio_config); + + if (!audio_capture.initialize()) { + fprintf(stderr, "Error: Failed to initialize audio capture\n"); + llama_free(lctx); + llama_model_free(model); +#ifdef FRAMEFORGE_WHISPER_SUPPORT + whisper_free(wctx); +#endif + return 1; + } + + if (!audio_capture.start()) { + fprintf(stderr, "Error: Failed to start audio capture\n"); + llama_free(lctx); + llama_model_free(model); +#ifdef FRAMEFORGE_WHISPER_SUPPORT + whisper_free(wctx); +#endif + return 1; + } + + // Initialize command validator + frameforge::CommandValidator validator; + + fprintf(stderr, "FrameForge Sidecar ready. Listening to microphone...\n"); + fprintf(stderr, "Press Ctrl+C to stop\n"); + + // Main loop for live audio + std::atomic running(true); + constexpr float MIN_AUDIO_DURATION_SEC = 2.0f; // Process at least 2 seconds of audio + const size_t MIN_AUDIO_SAMPLES = static_cast(MIN_AUDIO_DURATION_SEC * audio_config.sample_rate); + + while (running) { + std::this_thread::sleep_for(std::chrono::milliseconds(100)); + + // Get accumulated audio buffer + std::vector audio_buffer = audio_capture.get_audio_buffer(); + + // Check if we have enough audio to process + if (audio_buffer.size() >= MIN_AUDIO_SAMPLES) { + fprintf(stderr, "\nProcessing %.2f seconds of audio...\n", + static_cast(audio_buffer.size()) / audio_config.sample_rate); + +#ifdef FRAMEFORGE_WHISPER_SUPPORT + // Transcribe audio + fprintf(stderr, "Transcribing audio...\n"); + std::string transcription = transcribe_audio(wctx, audio_buffer, params.verbose); + + if (!transcription.empty()) { + fprintf(stderr, "Transcription: %s\n", transcription.c_str()); + + // Classify intent + fprintf(stderr, "Classifying intent...\n"); + std::string llm_response = classify_intent(lctx, model, transcription, params.verbose); + fprintf(stderr, "LLM Response: %s\n", llm_response.c_str()); + + // Validate the command + frameforge::Command cmd; + frameforge::ValidationResult result = validator.validate_json(llm_response, cmd); + + if (result.valid) { + std::string json_output = frameforge::command_to_json(cmd); + fprintf(stderr, "Valid command:\n%s\n", json_output.c_str()); + } else { + fprintf(stderr, "Validation failed: %s\n", result.error_message.c_str()); + std::string clarification = validator.generate_clarification_request(result, cmd); + fprintf(stderr, "Clarification: %s\n", clarification.c_str()); + } + } else { + fprintf(stderr, "No transcription generated (silence or noise)\n"); + } +#else + fprintf(stderr, "Error: Audio transcription requires Whisper support\n"); +#endif + + // Clear the buffer after processing + audio_capture.clear_buffer(); + + fprintf(stderr, "\nListening...\n"); + } + } + + audio_capture.stop(); + llama_free(lctx); + llama_model_free(model); +#ifdef FRAMEFORGE_WHISPER_SUPPORT + whisper_free(wctx); +#endif + return 0; + } +#else + if (params.live_audio) { + fprintf(stderr, "Error: Live audio capture requires PortAudio support (not compiled)\n"); + llama_free(lctx); + llama_model_free(model); +#ifdef FRAMEFORGE_WHISPER_SUPPORT + whisper_free(wctx); +#endif + return 1; + } +#endif + // Server mode: start IPC server fprintf(stderr, "Starting IPC server on pipe: %s\n", params.pipe_name.c_str()); frameforge::IPCServer ipc_server(params.pipe_name); From b8296936424212ddad3ea9fcbb7ba0a89d55aa32 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Thu, 1 Jan 2026 23:23:37 +0000 Subject: [PATCH 03/16] Update documentation for live audio capture feature Co-authored-by: TheOriginalBytePlayer <18058224+TheOriginalBytePlayer@users.noreply.github.com> --- tools/frameforge/README.md | 44 +++++++++++++++++++++++++-- tools/frameforge/frameforge-audio.cpp | 3 +- tools/frameforge/frameforge-audio.h | 1 + 3 files changed, 45 insertions(+), 3 deletions(-) diff --git a/tools/frameforge/README.md b/tools/frameforge/README.md index 45654198cad..d55fba1f0f2 100644 --- a/tools/frameforge/README.md +++ b/tools/frameforge/README.md @@ -5,7 +5,7 @@ This tool integrates Whisper.cpp for speech-to-text and Llama.cpp for intent cla ## Overview The FrameForge Sidecar is a 64-bit resident process that: -1. Receives audio input (via file or IPC) +1. Receives audio input (via file, IPC, or live microphone capture with PortAudio) 2. Transcribes audio to text using Whisper 3. Classifies intent and extracts parameters using Llama 4. Validates commands against a strict schema @@ -116,6 +116,30 @@ cmake --build build --config Release The binary will be located at: `build/bin/frameforge-sidecar` +### Dependencies + +**Required:** +- Llama.cpp (built-in) + +**Optional:** +- Whisper.cpp - For speech-to-text transcription (recommended) +- PortAudio - For live microphone audio capture (recommended for production use) + +To enable PortAudio support, install the development library before building: + +**Ubuntu/Debian:** +```bash +sudo apt-get install portaudio19-dev +``` + +**macOS:** +```bash +brew install portaudio +``` + +**Windows:** +Download and install PortAudio from http://www.portaudio.com/ + ## Usage ### Test Mode (with audio file) @@ -128,6 +152,20 @@ The binary will be located at: `build/bin/frameforge-sidecar` --verbose ``` +### Live Audio Capture Mode (with microphone) + +**Requires PortAudio support (see Building section)** + +```bash +./build/bin/frameforge-sidecar \ + --whisper-model /path/to/whisper-model.bin \ + --llama-model /path/to/llama-model.gguf \ + --live-audio \ + --verbose +``` + +This mode continuously captures audio from the default microphone, processes it in chunks (minimum 2 seconds), transcribes with Whisper, classifies with Llama, and validates commands. The audio buffer is cleared after each processing cycle. + ### Server Mode (IPC with Named Pipes) ```bash @@ -139,14 +177,16 @@ The binary will be located at: `build/bin/frameforge-sidecar` ### Command-Line Options -- `-wm, --whisper-model FNAME` - Path to Whisper model file (required) +- `-wm, --whisper-model FNAME` - Path to Whisper model file (required if Whisper support is compiled) - `-lm, --llama-model FNAME` - Path to Llama model file (required) - `-a, --audio FILE` - Audio file to transcribe (for testing) +- `-la, --live-audio` - Enable live audio capture via PortAudio (requires PortAudio support) - `-p, --pipe NAME` - Named pipe name (default: frameforge_pipe) - `-vd, --verb-defs FILE` - Path to verb definitions JSON file (optional) - `-t, --threads N` - Number of threads (default: 4) - `-v, --verbose` - Enable verbose output - `-h, --help` - Show help message +- `-h, --help` - Show help message ### Verb Definitions diff --git a/tools/frameforge/frameforge-audio.cpp b/tools/frameforge/frameforge-audio.cpp index db70f4bdc07..d94bf0d2c70 100644 --- a/tools/frameforge/frameforge-audio.cpp +++ b/tools/frameforge/frameforge-audio.cpp @@ -62,7 +62,8 @@ bool AudioCapture::initialize() { err = Pa_OpenStream( reinterpret_cast(&stream_), &input_params, nullptr, // no output - config_.sample_rate, config_.frames_per_buffer, paClipOff, pa_callback, this); + config_.sample_rate, config_.frames_per_buffer, paClipOff, + reinterpret_cast(pa_callback), this); if (err != paNoError) { std::cerr << "PortAudio error: " << Pa_GetErrorText(err) << std::endl; diff --git a/tools/frameforge/frameforge-audio.h b/tools/frameforge/frameforge-audio.h index a49acd3a47a..1255d984159 100644 --- a/tools/frameforge/frameforge-audio.h +++ b/tools/frameforge/frameforge-audio.h @@ -58,6 +58,7 @@ class AudioCapture { void * stream_; // PaStream* (opaque pointer to avoid including portaudio.h here) // PortAudio callback (static function) + // Uses void* for time_info to avoid including portaudio.h in the header static int pa_callback(const void * input, void * output, unsigned long frame_count, const void * time_info, unsigned long status_flags, void * user_data); From 11166a477d32a2911e690941536107e673383442 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Thu, 1 Jan 2026 23:25:37 +0000 Subject: [PATCH 04/16] Add test for audio capture functionality Co-authored-by: TheOriginalBytePlayer <18058224+TheOriginalBytePlayer@users.noreply.github.com> --- tests/CMakeLists.txt | 14 ++++ tests/test-frameforge-audio.cpp | 121 ++++++++++++++++++++++++++++++++ 2 files changed, 135 insertions(+) create mode 100644 tests/test-frameforge-audio.cpp diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index b61d463d593..7d9f645814d 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -276,5 +276,19 @@ target_link_libraries(test-frameforge-new-features PRIVATE common) add_test(NAME test-frameforge-new-features COMMAND $ WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}/tests) set_property(TEST test-frameforge-new-features PROPERTY LABELS "main") +# FrameForge Audio Capture Test +add_executable(test-frameforge-audio test-frameforge-audio.cpp + ${CMAKE_SOURCE_DIR}/tools/frameforge/frameforge-audio.cpp) +target_include_directories(test-frameforge-audio PRIVATE ${CMAKE_SOURCE_DIR}/tools/frameforge ${CMAKE_SOURCE_DIR}/vendor) +target_link_libraries(test-frameforge-audio PRIVATE common) +# Add PortAudio if available +if(PORTAUDIO_FOUND) + target_include_directories(test-frameforge-audio PRIVATE ${PORTAUDIO_INCLUDE_DIRS}) + target_link_libraries(test-frameforge-audio PRIVATE ${PORTAUDIO_LIBRARIES}) + target_compile_definitions(test-frameforge-audio PRIVATE FRAMEFORGE_PORTAUDIO_SUPPORT) +endif() +add_test(NAME test-frameforge-audio COMMAND $ WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}/tests) +set_property(TEST test-frameforge-audio PROPERTY LABELS "main") + llama_build_and_test(test-alloc.cpp) target_include_directories(test-alloc PRIVATE ${PROJECT_SOURCE_DIR}/ggml/src) diff --git a/tests/test-frameforge-audio.cpp b/tests/test-frameforge-audio.cpp new file mode 100644 index 00000000000..371fcf81874 --- /dev/null +++ b/tests/test-frameforge-audio.cpp @@ -0,0 +1,121 @@ +#include "../tools/frameforge/frameforge-audio.h" + +#include +#include +#include +#include + +using namespace frameforge; + +static void test_audio_config() { + std::cout << "Testing audio configuration..." << std::endl; + + AudioConfig config; + assert(config.sample_rate == 16000); + assert(config.channels == 1); + assert(config.frames_per_buffer == 512); + + // Custom config + AudioConfig custom; + custom.sample_rate = 44100; + custom.channels = 2; + custom.frames_per_buffer = 1024; + + assert(custom.sample_rate == 44100); + assert(custom.channels == 2); + assert(custom.frames_per_buffer == 1024); + + std::cout << " ✓ Audio configuration passed" << std::endl; +} + +static void test_audio_capture_initialization() { + std::cout << "Testing audio capture initialization..." << std::endl; + + AudioConfig config; + AudioCapture capture(config); + + // Just test that we can create an instance + assert(!capture.is_capturing()); + +#ifdef FRAMEFORGE_PORTAUDIO_SUPPORT + std::cout << " PortAudio support is available" << std::endl; + + // Try to initialize + bool init_result = capture.initialize(); + if (init_result) { + std::cout << " ✓ Audio capture initialization succeeded" << std::endl; + + // Test buffer operations + capture.clear_buffer(); + std::vector buffer = capture.get_audio_buffer(); + assert(buffer.empty()); + std::cout << " ✓ Buffer operations work" << std::endl; + } else { + std::cout << " ! Audio capture initialization failed (this is OK if no audio device is available)" << std::endl; + } +#else + std::cout << " PortAudio support is not available" << std::endl; + std::cout << " ✓ Stub implementation works" << std::endl; +#endif +} + +static void test_audio_callback() { + std::cout << "Testing audio callback..." << std::endl; + +#ifdef FRAMEFORGE_PORTAUDIO_SUPPORT + AudioConfig config; + AudioCapture capture(config); + + bool callback_called = false; + capture.set_callback([&callback_called](const std::vector & data) { + callback_called = true; + std::cout << " Callback received " << data.size() << " samples" << std::endl; + }); + + if (capture.initialize()) { + if (capture.start()) { + std::cout << " Audio capture started, waiting for callback..." << std::endl; + + // Wait for a short time to see if we get audio data + auto start_time = std::chrono::steady_clock::now(); + while (!callback_called) { + std::this_thread::sleep_for(std::chrono::milliseconds(100)); + auto elapsed = std::chrono::duration_cast( + std::chrono::steady_clock::now() - start_time + ).count(); + + if (elapsed > 2) { + std::cout << " ! No callback received after 2 seconds (no audio input?)" << std::endl; + break; + } + } + + capture.stop(); + + if (callback_called) { + std::cout << " ✓ Audio callback test passed" << std::endl; + } else { + std::cout << " ! Audio callback test completed (no audio detected)" << std::endl; + } + } else { + std::cout << " ! Could not start audio capture" << std::endl; + } + } else { + std::cout << " ! Could not initialize audio capture" << std::endl; + } +#else + std::cout << " PortAudio support not available, skipping callback test" << std::endl; +#endif +} + +int main() { + std::cout << "Running FrameForge Audio Capture Tests" << std::endl; + std::cout << "======================================" << std::endl; + + test_audio_config(); + test_audio_capture_initialization(); + test_audio_callback(); + + std::cout << "\nAll tests completed!" << std::endl; + return 0; +} From d26d75e3782b0d6a59f014885ae0495ee983273f Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Thu, 1 Jan 2026 23:29:28 +0000 Subject: [PATCH 05/16] Address code review feedback: fix Pa_Terminate calls, add signal handling, remove duplicate help line Co-authored-by: TheOriginalBytePlayer <18058224+TheOriginalBytePlayer@users.noreply.github.com> --- tools/frameforge/README.md | 1 - tools/frameforge/frameforge-audio.cpp | 24 ++++++++++++++-------- tools/frameforge/frameforge-sidecar.cpp | 27 +++++++++++++++++++++---- 3 files changed, 39 insertions(+), 13 deletions(-) diff --git a/tools/frameforge/README.md b/tools/frameforge/README.md index d55fba1f0f2..d1c8003cf39 100644 --- a/tools/frameforge/README.md +++ b/tools/frameforge/README.md @@ -186,7 +186,6 @@ This mode continuously captures audio from the default microphone, processes it - `-t, --threads N` - Number of threads (default: 4) - `-v, --verbose` - Enable verbose output - `-h, --help` - Show help message -- `-h, --help` - Show help message ### Verb Definitions diff --git a/tools/frameforge/frameforge-audio.cpp b/tools/frameforge/frameforge-audio.cpp index d94bf0d2c70..2a5ff57f5b6 100644 --- a/tools/frameforge/frameforge-audio.cpp +++ b/tools/frameforge/frameforge-audio.cpp @@ -11,6 +11,10 @@ namespace frameforge { #ifdef FRAMEFORGE_PORTAUDIO_SUPPORT +// Global PortAudio initialization tracking +static bool g_portaudio_initialized = false; +static int g_portaudio_ref_count = 0; + AudioCapture::AudioCapture(const AudioConfig & config) : config_(config) , callback_(nullptr) @@ -24,21 +28,26 @@ AudioCapture::~AudioCapture() { Pa_CloseStream(static_cast(stream_)); stream_ = nullptr; } - Pa_Terminate(); + // Note: We don't call Pa_Terminate() here as it affects global PortAudio state + // In a production application, Pa_Terminate() should be called once at application shutdown } bool AudioCapture::initialize() { - PaError err = Pa_Initialize(); - if (err != paNoError) { - std::cerr << "PortAudio error: " << Pa_GetErrorText(err) << std::endl; - return false; + // Initialize PortAudio if not already initialized + if (!g_portaudio_initialized) { + PaError err = Pa_Initialize(); + if (err != paNoError) { + std::cerr << "PortAudio error: " << Pa_GetErrorText(err) << std::endl; + return false; + } + g_portaudio_initialized = true; } + g_portaudio_ref_count++; // Get default input device PaDeviceIndex device = Pa_GetDefaultInputDevice(); if (device == paNoDevice) { std::cerr << "Error: No default input device found" << std::endl; - Pa_Terminate(); return false; } @@ -59,7 +68,7 @@ bool AudioCapture::initialize() { input_params.hostApiSpecificStreamInfo = nullptr; // Open audio stream - err = Pa_OpenStream( + PaError err = Pa_OpenStream( reinterpret_cast(&stream_), &input_params, nullptr, // no output config_.sample_rate, config_.frames_per_buffer, paClipOff, @@ -67,7 +76,6 @@ bool AudioCapture::initialize() { if (err != paNoError) { std::cerr << "PortAudio error: " << Pa_GetErrorText(err) << std::endl; - Pa_Terminate(); return false; } diff --git a/tools/frameforge/frameforge-sidecar.cpp b/tools/frameforge/frameforge-sidecar.cpp index 9f88c6ab5f7..84b5a637e02 100644 --- a/tools/frameforge/frameforge-sidecar.cpp +++ b/tools/frameforge/frameforge-sidecar.cpp @@ -11,6 +11,7 @@ #include #include +#include #include #include #include @@ -19,6 +20,17 @@ #include #include +// Global flag for signal handling +static std::atomic g_running(true); + +// Signal handler for graceful shutdown +static void signal_handler(int signal) { + if (signal == SIGINT || signal == SIGTERM) { + fprintf(stderr, "\nReceived signal %d, shutting down...\n", signal); + g_running = false; + } +} + // System prompt for Llama intent classification static const char * INTENT_SYSTEM_PROMPT = R"(You are an intent classifier for FrameForge Studio, a professional previsualization software. @@ -481,15 +493,18 @@ int main(int argc, char ** argv) { // Initialize command validator frameforge::CommandValidator validator; + // Set up signal handlers for graceful shutdown + std::signal(SIGINT, signal_handler); + std::signal(SIGTERM, signal_handler); + fprintf(stderr, "FrameForge Sidecar ready. Listening to microphone...\n"); fprintf(stderr, "Press Ctrl+C to stop\n"); // Main loop for live audio - std::atomic running(true); constexpr float MIN_AUDIO_DURATION_SEC = 2.0f; // Process at least 2 seconds of audio const size_t MIN_AUDIO_SAMPLES = static_cast(MIN_AUDIO_DURATION_SEC * audio_config.sample_rate); - while (running) { + while (g_running) { std::this_thread::sleep_for(std::chrono::milliseconds(100)); // Get accumulated audio buffer @@ -573,11 +588,15 @@ int main(int argc, char ** argv) { return 1; } + // Set up signal handlers for graceful shutdown + std::signal(SIGINT, signal_handler); + std::signal(SIGTERM, signal_handler); + fprintf(stderr, "FrameForge Sidecar ready. Waiting for commands...\n"); + fprintf(stderr, "Press Ctrl+C to stop\n"); // Main loop - std::atomic running(true); - while (running) { + while (g_running) { // In a real implementation, this would: // 1. Receive audio data from the IPC pipe // 2. Transcribe with Whisper From 910ac3982d23c10ceb1b1297a2a32dbdd57b99ff Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Thu, 1 Jan 2026 23:55:30 +0000 Subject: [PATCH 06/16] Implement VAD-based audio processing: trigger after 500ms speech + 250ms silence Co-authored-by: TheOriginalBytePlayer <18058224+TheOriginalBytePlayer@users.noreply.github.com> --- tools/frameforge/README.md | 6 ++- tools/frameforge/frameforge-audio.cpp | 71 ++++++++++++++++++++++++- tools/frameforge/frameforge-audio.h | 25 +++++++++ tools/frameforge/frameforge-sidecar.cpp | 22 ++++---- 4 files changed, 110 insertions(+), 14 deletions(-) diff --git a/tools/frameforge/README.md b/tools/frameforge/README.md index d1c8003cf39..b0221522485 100644 --- a/tools/frameforge/README.md +++ b/tools/frameforge/README.md @@ -164,7 +164,11 @@ Download and install PortAudio from http://www.portaudio.com/ --verbose ``` -This mode continuously captures audio from the default microphone, processes it in chunks (minimum 2 seconds), transcribes with Whisper, classifies with Llama, and validates commands. The audio buffer is cleared after each processing cycle. +This mode continuously captures audio from the default microphone using Voice Activity Detection (VAD). It automatically detects when speech begins and ends, then processes the audio when: +1. At least 500ms of speech is detected +2. Followed by 250ms of silence + +The captured speech is then transcribed with Whisper, classified with Llama, and validated. The audio buffer is cleared after each processing cycle. ### Server Mode (IPC with Named Pipes) diff --git a/tools/frameforge/frameforge-audio.cpp b/tools/frameforge/frameforge-audio.cpp index 2a5ff57f5b6..0fbfcb039b1 100644 --- a/tools/frameforge/frameforge-audio.cpp +++ b/tools/frameforge/frameforge-audio.cpp @@ -4,8 +4,9 @@ #include #endif -#include +#include #include +#include namespace frameforge { @@ -19,7 +20,18 @@ AudioCapture::AudioCapture(const AudioConfig & config) : config_(config) , callback_(nullptr) , capturing_(false) - , stream_(nullptr) { + , stream_(nullptr) + , ready_to_process_(false) + , has_speech_(false) + , speech_sample_count_(0) + , silence_sample_count_(0) { + // Calculate sample thresholds + min_speech_samples_ = static_cast( + (config_.min_speech_duration_ms / 1000.0f) * config_.sample_rate + ); + silence_samples_threshold_ = static_cast( + (config_.silence_duration_ms / 1000.0f) * config_.sample_rate + ); } AudioCapture::~AudioCapture() { @@ -130,6 +142,31 @@ void AudioCapture::clear_buffer() { audio_buffer_.clear(); } +void AudioCapture::reset_vad_state() { + ready_to_process_ = false; + has_speech_ = false; + speech_sample_count_ = 0; + silence_sample_count_ = 0; +} + +float AudioCapture::calculate_rms(const float * data, size_t sample_count) const { + if (!data || sample_count == 0) { + return 0.0f; + } + + float sum_squares = 0.0f; + for (size_t i = 0; i < sample_count; ++i) { + sum_squares += data[i] * data[i]; + } + + return std::sqrt(sum_squares / sample_count); +} + +bool AudioCapture::is_speech(const float * data, size_t sample_count) const { + float rms = calculate_rms(data, sample_count); + return rms > config_.vad_threshold; +} + int AudioCapture::pa_callback(const void * input, void * output, unsigned long frame_count, const void * time_info, unsigned long status_flags, void * user_data) { (void) output; @@ -160,6 +197,36 @@ void AudioCapture::handle_audio_data(const float * data, unsigned long frame_cou audio_buffer_.insert(audio_buffer_.end(), data, data + total_samples); } + // Perform voice activity detection + bool current_is_speech = is_speech(data, total_samples); + + if (current_is_speech) { + // We have speech + speech_sample_count_ += total_samples; + silence_sample_count_ = 0; // Reset silence counter + + // Mark that we've detected speech + if (speech_sample_count_ >= min_speech_samples_) { + has_speech_ = true; + } + } else { + // We have silence + if (has_speech_) { + // We had speech before, now counting silence + silence_sample_count_ += total_samples; + + // Check if we've had enough silence to trigger processing + if (silence_sample_count_ >= silence_samples_threshold_) { + ready_to_process_ = true; + } + } + // If we don't have speech yet, keep resetting counters + else { + speech_sample_count_ = 0; + silence_sample_count_ = 0; + } + } + // Call user callback if set if (callback_) { std::vector callback_data(data, data + total_samples); diff --git a/tools/frameforge/frameforge-audio.h b/tools/frameforge/frameforge-audio.h index 1255d984159..0c08f2f5dac 100644 --- a/tools/frameforge/frameforge-audio.h +++ b/tools/frameforge/frameforge-audio.h @@ -13,6 +13,9 @@ struct AudioConfig { int sample_rate = 16000; // Sample rate in Hz (16kHz is standard for Whisper) int channels = 1; // Number of channels (1 = mono) int frames_per_buffer = 512; // Number of frames per buffer + float vad_threshold = 0.01f; // Voice activity detection threshold (RMS) + float min_speech_duration_ms = 500.0f; // Minimum speech duration in milliseconds + float silence_duration_ms = 250.0f; // Silence duration to trigger processing }; // Audio capture callback function type @@ -48,6 +51,12 @@ class AudioCapture { // Clear the audio buffer void clear_buffer(); + + // Check if ready to process (speech detected + silence after) + bool is_ready_to_process() const { return ready_to_process_; } + + // Reset VAD state + void reset_vad_state(); private: AudioConfig config_; @@ -56,6 +65,14 @@ class AudioCapture { std::vector audio_buffer_; std::mutex buffer_mutex_; void * stream_; // PaStream* (opaque pointer to avoid including portaudio.h here) + + // VAD state tracking + std::atomic ready_to_process_; + std::atomic has_speech_; + std::atomic speech_sample_count_; + std::atomic silence_sample_count_; + size_t min_speech_samples_; + size_t silence_samples_threshold_; // PortAudio callback (static function) // Uses void* for time_info to avoid including portaudio.h in the header @@ -64,6 +81,12 @@ class AudioCapture { // Instance callback handler void handle_audio_data(const float * data, unsigned long frame_count); + + // Calculate RMS (root mean square) of audio data + float calculate_rms(const float * data, size_t sample_count) const; + + // Check if audio chunk contains speech + bool is_speech(const float * data, size_t sample_count) const; }; #else @@ -81,6 +104,8 @@ class AudioCapture { void set_callback(AudioCallback callback) { (void) callback; } std::vector get_audio_buffer() { return std::vector(); } void clear_buffer() {} + bool is_ready_to_process() const { return false; } + void reset_vad_state() {} }; #endif diff --git a/tools/frameforge/frameforge-sidecar.cpp b/tools/frameforge/frameforge-sidecar.cpp index 84b5a637e02..34bcacbeeea 100644 --- a/tools/frameforge/frameforge-sidecar.cpp +++ b/tools/frameforge/frameforge-sidecar.cpp @@ -499,19 +499,18 @@ int main(int argc, char ** argv) { fprintf(stderr, "FrameForge Sidecar ready. Listening to microphone...\n"); fprintf(stderr, "Press Ctrl+C to stop\n"); + fprintf(stderr, "VAD settings: min_speech=%.0fms, silence=%.0fms\n", + audio_config.min_speech_duration_ms, audio_config.silence_duration_ms); - // Main loop for live audio - constexpr float MIN_AUDIO_DURATION_SEC = 2.0f; // Process at least 2 seconds of audio - const size_t MIN_AUDIO_SAMPLES = static_cast(MIN_AUDIO_DURATION_SEC * audio_config.sample_rate); - + // Main loop for live audio with VAD while (g_running) { - std::this_thread::sleep_for(std::chrono::milliseconds(100)); - - // Get accumulated audio buffer - std::vector audio_buffer = audio_capture.get_audio_buffer(); + std::this_thread::sleep_for(std::chrono::milliseconds(50)); - // Check if we have enough audio to process - if (audio_buffer.size() >= MIN_AUDIO_SAMPLES) { + // Check if VAD has detected speech followed by silence + if (audio_capture.is_ready_to_process()) { + // Get accumulated audio buffer + std::vector audio_buffer = audio_capture.get_audio_buffer(); + fprintf(stderr, "\nProcessing %.2f seconds of audio...\n", static_cast(audio_buffer.size()) / audio_config.sample_rate); @@ -547,8 +546,9 @@ int main(int argc, char ** argv) { fprintf(stderr, "Error: Audio transcription requires Whisper support\n"); #endif - // Clear the buffer after processing + // Clear the buffer and reset VAD state after processing audio_capture.clear_buffer(); + audio_capture.reset_vad_state(); fprintf(stderr, "\nListening...\n"); } From cd2d75a9c9f9ae4f13bb7abdccb7b57c1e85da1e Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Thu, 1 Jan 2026 23:58:34 +0000 Subject: [PATCH 07/16] Fix race conditions in VAD state management with mutex protection Co-authored-by: TheOriginalBytePlayer <18058224+TheOriginalBytePlayer@users.noreply.github.com> --- tools/frameforge/frameforge-audio.cpp | 49 +++++++++++++++------------ tools/frameforge/frameforge-audio.h | 16 +++++---- 2 files changed, 38 insertions(+), 27 deletions(-) diff --git a/tools/frameforge/frameforge-audio.cpp b/tools/frameforge/frameforge-audio.cpp index 0fbfcb039b1..bc59e30ad60 100644 --- a/tools/frameforge/frameforge-audio.cpp +++ b/tools/frameforge/frameforge-audio.cpp @@ -4,6 +4,7 @@ #include #endif +#include #include #include #include @@ -143,6 +144,7 @@ void AudioCapture::clear_buffer() { } void AudioCapture::reset_vad_state() { + std::lock_guard lock(vad_mutex_); ready_to_process_ = false; has_speech_ = false; speech_sample_count_ = 0; @@ -200,30 +202,35 @@ void AudioCapture::handle_audio_data(const float * data, unsigned long frame_cou // Perform voice activity detection bool current_is_speech = is_speech(data, total_samples); - if (current_is_speech) { - // We have speech - speech_sample_count_ += total_samples; - silence_sample_count_ = 0; // Reset silence counter + // Update VAD state atomically + { + std::lock_guard lock(vad_mutex_); - // Mark that we've detected speech - if (speech_sample_count_ >= min_speech_samples_) { - has_speech_ = true; - } - } else { - // We have silence - if (has_speech_) { - // We had speech before, now counting silence - silence_sample_count_ += total_samples; + if (current_is_speech) { + // We have speech + speech_sample_count_ += total_samples; + silence_sample_count_ = 0; // Reset silence counter - // Check if we've had enough silence to trigger processing - if (silence_sample_count_ >= silence_samples_threshold_) { - ready_to_process_ = true; + // Mark that we've detected speech + if (speech_sample_count_ >= min_speech_samples_) { + has_speech_ = true; + } + } else { + // We have silence + if (has_speech_) { + // We had speech before, now counting silence + silence_sample_count_ += total_samples; + + // Check if we've had enough silence to trigger processing + if (silence_sample_count_ >= silence_samples_threshold_) { + ready_to_process_ = true; + } + } + // If we don't have speech yet, keep resetting counters + else { + speech_sample_count_ = 0; + silence_sample_count_ = 0; } - } - // If we don't have speech yet, keep resetting counters - else { - speech_sample_count_ = 0; - silence_sample_count_ = 0; } } diff --git a/tools/frameforge/frameforge-audio.h b/tools/frameforge/frameforge-audio.h index 0c08f2f5dac..84ea96db401 100644 --- a/tools/frameforge/frameforge-audio.h +++ b/tools/frameforge/frameforge-audio.h @@ -53,7 +53,10 @@ class AudioCapture { void clear_buffer(); // Check if ready to process (speech detected + silence after) - bool is_ready_to_process() const { return ready_to_process_; } + bool is_ready_to_process() const { + std::lock_guard lock(const_cast(vad_mutex_)); + return ready_to_process_; + } // Reset VAD state void reset_vad_state(); @@ -66,11 +69,12 @@ class AudioCapture { std::mutex buffer_mutex_; void * stream_; // PaStream* (opaque pointer to avoid including portaudio.h here) - // VAD state tracking - std::atomic ready_to_process_; - std::atomic has_speech_; - std::atomic speech_sample_count_; - std::atomic silence_sample_count_; + // VAD state tracking (protected by vad_mutex_) + std::mutex vad_mutex_; + bool ready_to_process_; + bool has_speech_; + size_t speech_sample_count_; + size_t silence_sample_count_; size_t min_speech_samples_; size_t silence_samples_threshold_; From 21f63f533ebe512e93a91a535b86bc4d3cdd9f78 Mon Sep 17 00:00:00 2001 From: TheOriginalBytePlayer Date: Thu, 1 Jan 2026 16:16:11 -0800 Subject: [PATCH 08/16] Update tools/frameforge/frameforge-audio.cpp Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- tools/frameforge/frameforge-audio.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tools/frameforge/frameforge-audio.cpp b/tools/frameforge/frameforge-audio.cpp index bc59e30ad60..0ce4aa8abce 100644 --- a/tools/frameforge/frameforge-audio.cpp +++ b/tools/frameforge/frameforge-audio.cpp @@ -135,7 +135,9 @@ void AudioCapture::set_callback(AudioCallback callback) { std::vector AudioCapture::get_audio_buffer() { std::lock_guard lock(buffer_mutex_); - return audio_buffer_; + std::vector buffer_copy; + buffer_copy.swap(audio_buffer_); + return buffer_copy; } void AudioCapture::clear_buffer() { From dd7d7cb01773ab8f7c4dc1d28c498df5b7d14470 Mon Sep 17 00:00:00 2001 From: TheOriginalBytePlayer Date: Thu, 1 Jan 2026 16:16:39 -0800 Subject: [PATCH 09/16] Update tools/frameforge/frameforge-audio.cpp Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- tools/frameforge/frameforge-audio.cpp | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tools/frameforge/frameforge-audio.cpp b/tools/frameforge/frameforge-audio.cpp index 0ce4aa8abce..7192a4d370d 100644 --- a/tools/frameforge/frameforge-audio.cpp +++ b/tools/frameforge/frameforge-audio.cpp @@ -122,7 +122,11 @@ void AudioCapture::stop() { } if (stream_) { - Pa_StopStream(static_cast(stream_)); + PaError err = Pa_StopStream(static_cast(stream_)); + if (err != paNoError) { + std::cerr << "PortAudio error when stopping stream: " << Pa_GetErrorText(err) << std::endl; + return; + } } capturing_ = false; From 6f8b9f6b556682c4a0dcf19ecba5b46a910daf2f Mon Sep 17 00:00:00 2001 From: TheOriginalBytePlayer Date: Thu, 1 Jan 2026 16:17:12 -0800 Subject: [PATCH 10/16] Update tools/frameforge/frameforge-sidecar.cpp Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- tools/frameforge/frameforge-sidecar.cpp | 3 --- 1 file changed, 3 deletions(-) diff --git a/tools/frameforge/frameforge-sidecar.cpp b/tools/frameforge/frameforge-sidecar.cpp index 34bcacbeeea..3c62b8f1bc6 100644 --- a/tools/frameforge/frameforge-sidecar.cpp +++ b/tools/frameforge/frameforge-sidecar.cpp @@ -490,9 +490,6 @@ int main(int argc, char ** argv) { return 1; } - // Initialize command validator - frameforge::CommandValidator validator; - // Set up signal handlers for graceful shutdown std::signal(SIGINT, signal_handler); std::signal(SIGTERM, signal_handler); From 69833a9822d0de7885f1249553389fdb148bb2f6 Mon Sep 17 00:00:00 2001 From: TheOriginalBytePlayer Date: Thu, 1 Jan 2026 16:17:24 -0800 Subject: [PATCH 11/16] Update tests/test-frameforge-audio.cpp Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- tests/test-frameforge-audio.cpp | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tests/test-frameforge-audio.cpp b/tests/test-frameforge-audio.cpp index 371fcf81874..78c95945397 100644 --- a/tests/test-frameforge-audio.cpp +++ b/tests/test-frameforge-audio.cpp @@ -45,11 +45,17 @@ static void test_audio_capture_initialization() { if (init_result) { std::cout << " ✓ Audio capture initialization succeeded" << std::endl; + // Follow typical usage pattern: start capturing before testing buffer operations. + capture.start(); + // Test buffer operations capture.clear_buffer(); std::vector buffer = capture.get_audio_buffer(); assert(buffer.empty()); std::cout << " ✓ Buffer operations work" << std::endl; + + // Stop capturing to complete the typical lifecycle. + capture.stop(); } else { std::cout << " ! Audio capture initialization failed (this is OK if no audio device is available)" << std::endl; } From d3a5a0d8b7583723272b8489556487ca6f91a66e Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Fri, 2 Jan 2026 00:34:01 +0000 Subject: [PATCH 12/16] Fix multiple initialize() calls and add thread-safe PortAudio initialization Co-authored-by: TheOriginalBytePlayer <18058224+TheOriginalBytePlayer@users.noreply.github.com> --- tools/frameforge/frameforge-audio.cpp | 68 +++++++++++++++++++++------ tools/frameforge/frameforge-audio.h | 1 + 2 files changed, 55 insertions(+), 14 deletions(-) diff --git a/tools/frameforge/frameforge-audio.cpp b/tools/frameforge/frameforge-audio.cpp index 7192a4d370d..01462ba3d8e 100644 --- a/tools/frameforge/frameforge-audio.cpp +++ b/tools/frameforge/frameforge-audio.cpp @@ -13,7 +13,8 @@ namespace frameforge { #ifdef FRAMEFORGE_PORTAUDIO_SUPPORT -// Global PortAudio initialization tracking +// Global PortAudio initialization tracking (protected by mutex) +static std::mutex g_portaudio_mutex; static bool g_portaudio_initialized = false; static int g_portaudio_ref_count = 0; @@ -22,6 +23,7 @@ AudioCapture::AudioCapture(const AudioConfig & config) , callback_(nullptr) , capturing_(false) , stream_(nullptr) + , initialized_(false) , ready_to_process_(false) , has_speech_(false) , speech_sample_count_(0) @@ -41,26 +43,48 @@ AudioCapture::~AudioCapture() { Pa_CloseStream(static_cast(stream_)); stream_ = nullptr; } - // Note: We don't call Pa_Terminate() here as it affects global PortAudio state - // In a production application, Pa_Terminate() should be called once at application shutdown + + // Decrement reference count and terminate PortAudio if needed + if (initialized_) { + std::lock_guard lock(g_portaudio_mutex); + g_portaudio_ref_count--; + if (g_portaudio_ref_count == 0) { + Pa_Terminate(); + g_portaudio_initialized = false; + } + } } bool AudioCapture::initialize() { - // Initialize PortAudio if not already initialized - if (!g_portaudio_initialized) { - PaError err = Pa_Initialize(); - if (err != paNoError) { - std::cerr << "PortAudio error: " << Pa_GetErrorText(err) << std::endl; - return false; + // Check if already initialized + if (initialized_) { + std::cerr << "Warning: AudioCapture already initialized" << std::endl; + return true; + } + + // Initialize PortAudio if not already initialized (thread-safe) + { + std::lock_guard lock(g_portaudio_mutex); + if (!g_portaudio_initialized) { + PaError err = Pa_Initialize(); + if (err != paNoError) { + std::cerr << "PortAudio error: " << Pa_GetErrorText(err) << std::endl; + return false; + } + g_portaudio_initialized = true; } - g_portaudio_initialized = true; + g_portaudio_ref_count++; } - g_portaudio_ref_count++; + initialized_ = true; // Get default input device PaDeviceIndex device = Pa_GetDefaultInputDevice(); if (device == paNoDevice) { std::cerr << "Error: No default input device found" << std::endl; + // Decrement ref count on error + std::lock_guard lock(g_portaudio_mutex); + g_portaudio_ref_count--; + initialized_ = false; return false; } @@ -70,6 +94,13 @@ bool AudioCapture::initialize() { std::cout << "Using audio device: " << device_info->name << std::endl; std::cout << " Sample rate: " << config_.sample_rate << " Hz" << std::endl; std::cout << " Channels: " << config_.channels << std::endl; + } else { + std::cerr << "Error: Could not get device info" << std::endl; + // Decrement ref count on error + std::lock_guard lock(g_portaudio_mutex); + g_portaudio_ref_count--; + initialized_ = false; + return false; } // Set up stream parameters @@ -81,17 +112,23 @@ bool AudioCapture::initialize() { input_params.hostApiSpecificStreamInfo = nullptr; // Open audio stream + PaStream * pa_stream = nullptr; PaError err = Pa_OpenStream( - reinterpret_cast(&stream_), &input_params, + &pa_stream, &input_params, nullptr, // no output config_.sample_rate, config_.frames_per_buffer, paClipOff, reinterpret_cast(pa_callback), this); if (err != paNoError) { std::cerr << "PortAudio error: " << Pa_GetErrorText(err) << std::endl; + // Decrement ref count on error + std::lock_guard lock(g_portaudio_mutex); + g_portaudio_ref_count--; + initialized_ = false; return false; } - + + stream_ = pa_stream; return true; } @@ -241,8 +278,11 @@ void AudioCapture::handle_audio_data(const float * data, unsigned long frame_cou } // Call user callback if set + // Note: Creating vector from raw data involves a copy, but this matches + // the callback signature. For high-performance use cases, consider using + // the internal buffer directly or redesigning the callback interface. if (callback_) { - std::vector callback_data(data, data + total_samples); + const std::vector callback_data(data, data + total_samples); callback_(callback_data); } } diff --git a/tools/frameforge/frameforge-audio.h b/tools/frameforge/frameforge-audio.h index 84ea96db401..044d97130a1 100644 --- a/tools/frameforge/frameforge-audio.h +++ b/tools/frameforge/frameforge-audio.h @@ -68,6 +68,7 @@ class AudioCapture { std::vector audio_buffer_; std::mutex buffer_mutex_; void * stream_; // PaStream* (opaque pointer to avoid including portaudio.h here) + bool initialized_; // Track if this instance has initialized PortAudio // VAD state tracking (protected by vad_mutex_) std::mutex vad_mutex_; From 07ab1bfab67114b5b9548455464b18e9ce8ed6d5 Mon Sep 17 00:00:00 2001 From: TheOriginalBytePlayer Date: Thu, 1 Jan 2026 17:36:37 -0800 Subject: [PATCH 13/16] Update tools/frameforge/frameforge-sidecar.cpp Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- tools/frameforge/frameforge-sidecar.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/tools/frameforge/frameforge-sidecar.cpp b/tools/frameforge/frameforge-sidecar.cpp index 3c62b8f1bc6..16e4b19eeb4 100644 --- a/tools/frameforge/frameforge-sidecar.cpp +++ b/tools/frameforge/frameforge-sidecar.cpp @@ -26,7 +26,6 @@ static std::atomic g_running(true); // Signal handler for graceful shutdown static void signal_handler(int signal) { if (signal == SIGINT || signal == SIGTERM) { - fprintf(stderr, "\nReceived signal %d, shutting down...\n", signal); g_running = false; } } From 82776bedd11d4778486ac1c3d4c216d139306f93 Mon Sep 17 00:00:00 2001 From: TheOriginalBytePlayer Date: Thu, 1 Jan 2026 17:39:09 -0800 Subject: [PATCH 14/16] Update tools/frameforge/frameforge-audio.cpp Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- tools/frameforge/frameforge-audio.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/frameforge/frameforge-audio.cpp b/tools/frameforge/frameforge-audio.cpp index 01462ba3d8e..c8ff6d9b735 100644 --- a/tools/frameforge/frameforge-audio.cpp +++ b/tools/frameforge/frameforge-audio.cpp @@ -117,7 +117,7 @@ bool AudioCapture::initialize() { &pa_stream, &input_params, nullptr, // no output config_.sample_rate, config_.frames_per_buffer, paClipOff, - reinterpret_cast(pa_callback), this); + pa_callback, this); if (err != paNoError) { std::cerr << "PortAudio error: " << Pa_GetErrorText(err) << std::endl; From 3b7e3b47c4876851a2955ec6f0d443c27497d931 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Fri, 2 Jan 2026 01:45:08 +0000 Subject: [PATCH 15/16] Require both PortAudio and Whisper for live audio mode at compile time Co-authored-by: TheOriginalBytePlayer <18058224+TheOriginalBytePlayer@users.noreply.github.com> --- tools/frameforge/frameforge-sidecar.cpp | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) diff --git a/tools/frameforge/frameforge-sidecar.cpp b/tools/frameforge/frameforge-sidecar.cpp index 16e4b19eeb4..a8a1f0248ad 100644 --- a/tools/frameforge/frameforge-sidecar.cpp +++ b/tools/frameforge/frameforge-sidecar.cpp @@ -458,7 +458,7 @@ int main(int argc, char ** argv) { } // Live audio capture mode: capture audio from microphone -#ifdef FRAMEFORGE_PORTAUDIO_SUPPORT +#if defined(FRAMEFORGE_PORTAUDIO_SUPPORT) && defined(FRAMEFORGE_WHISPER_SUPPORT) if (params.live_audio) { fprintf(stderr, "Starting live audio capture mode...\n"); @@ -473,9 +473,7 @@ int main(int argc, char ** argv) { fprintf(stderr, "Error: Failed to initialize audio capture\n"); llama_free(lctx); llama_model_free(model); -#ifdef FRAMEFORGE_WHISPER_SUPPORT whisper_free(wctx); -#endif return 1; } @@ -483,9 +481,7 @@ int main(int argc, char ** argv) { fprintf(stderr, "Error: Failed to start audio capture\n"); llama_free(lctx); llama_model_free(model); -#ifdef FRAMEFORGE_WHISPER_SUPPORT whisper_free(wctx); -#endif return 1; } @@ -510,7 +506,6 @@ int main(int argc, char ** argv) { fprintf(stderr, "\nProcessing %.2f seconds of audio...\n", static_cast(audio_buffer.size()) / audio_config.sample_rate); -#ifdef FRAMEFORGE_WHISPER_SUPPORT // Transcribe audio fprintf(stderr, "Transcribing audio...\n"); std::string transcription = transcribe_audio(wctx, audio_buffer, params.verbose); @@ -538,9 +533,6 @@ int main(int argc, char ** argv) { } else { fprintf(stderr, "No transcription generated (silence or noise)\n"); } -#else - fprintf(stderr, "Error: Audio transcription requires Whisper support\n"); -#endif // Clear the buffer and reset VAD state after processing audio_capture.clear_buffer(); @@ -553,14 +545,12 @@ int main(int argc, char ** argv) { audio_capture.stop(); llama_free(lctx); llama_model_free(model); -#ifdef FRAMEFORGE_WHISPER_SUPPORT whisper_free(wctx); -#endif return 0; } #else if (params.live_audio) { - fprintf(stderr, "Error: Live audio capture requires PortAudio support (not compiled)\n"); + fprintf(stderr, "Error: Live audio capture requires both PortAudio and Whisper support\n"); llama_free(lctx); llama_model_free(model); #ifdef FRAMEFORGE_WHISPER_SUPPORT From cf18ae66eaae4689bf7341a9e1826e730d4c65c1 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Fri, 2 Jan 2026 01:56:47 +0000 Subject: [PATCH 16/16] Address remaining review feedback: update docs, fix signal handlers, add buffer size limit, fix callback signature Co-authored-by: TheOriginalBytePlayer <18058224+TheOriginalBytePlayer@users.noreply.github.com> --- tools/frameforge/README.md | 2 +- tools/frameforge/frameforge-audio.cpp | 17 +++++++++++++++-- tools/frameforge/frameforge-audio.h | 12 ++++++++++-- tools/frameforge/frameforge-sidecar.cpp | 12 ++++-------- 4 files changed, 30 insertions(+), 13 deletions(-) diff --git a/tools/frameforge/README.md b/tools/frameforge/README.md index b0221522485..1fe25a8cf15 100644 --- a/tools/frameforge/README.md +++ b/tools/frameforge/README.md @@ -154,7 +154,7 @@ Download and install PortAudio from http://www.portaudio.com/ ### Live Audio Capture Mode (with microphone) -**Requires PortAudio support (see Building section)** +**Requires both PortAudio and Whisper support (see Building section)** ```bash ./build/bin/frameforge-sidecar \ diff --git a/tools/frameforge/frameforge-audio.cpp b/tools/frameforge/frameforge-audio.cpp index c8ff6d9b735..8929fc5a616 100644 --- a/tools/frameforge/frameforge-audio.cpp +++ b/tools/frameforge/frameforge-audio.cpp @@ -35,6 +35,9 @@ AudioCapture::AudioCapture(const AudioConfig & config) silence_samples_threshold_ = static_cast( (config_.silence_duration_ms / 1000.0f) * config_.sample_rate ); + max_buffer_samples_ = static_cast( + config_.max_buffer_duration_s * config_.sample_rate * config_.channels + ); } AudioCapture::~AudioCapture() { @@ -213,7 +216,8 @@ bool AudioCapture::is_speech(const float * data, size_t sample_count) const { } int AudioCapture::pa_callback(const void * input, void * output, unsigned long frame_count, - const void * time_info, unsigned long status_flags, void * user_data) { + const PaStreamCallbackTimeInfo * time_info, + PaStreamCallbackFlags status_flags, void * user_data) { (void) output; (void) time_info; (void) status_flags; @@ -236,9 +240,18 @@ void AudioCapture::handle_audio_data(const float * data, unsigned long frame_cou // Calculate total samples (frames * channels) size_t total_samples = frame_count * config_.channels; - // Store in buffer + // Store in buffer with size check to prevent unbounded growth { std::lock_guard lock(buffer_mutex_); + + // Check if adding this data would exceed maximum buffer size + if (audio_buffer_.size() + total_samples > max_buffer_samples_) { + // Buffer is too large - remove oldest samples to make room + // This implements a rolling window approach + size_t samples_to_remove = (audio_buffer_.size() + total_samples) - max_buffer_samples_; + audio_buffer_.erase(audio_buffer_.begin(), audio_buffer_.begin() + samples_to_remove); + } + audio_buffer_.insert(audio_buffer_.end(), data, data + total_samples); } diff --git a/tools/frameforge/frameforge-audio.h b/tools/frameforge/frameforge-audio.h index 044d97130a1..555f00ed401 100644 --- a/tools/frameforge/frameforge-audio.h +++ b/tools/frameforge/frameforge-audio.h @@ -6,6 +6,12 @@ #include #include +// Forward declarations for PortAudio types +#ifdef FRAMEFORGE_PORTAUDIO_SUPPORT +struct PaStreamCallbackTimeInfo; +typedef unsigned long PaStreamCallbackFlags; +#endif + namespace frameforge { // Audio capture configuration @@ -16,6 +22,7 @@ struct AudioConfig { float vad_threshold = 0.01f; // Voice activity detection threshold (RMS) float min_speech_duration_ms = 500.0f; // Minimum speech duration in milliseconds float silence_duration_ms = 250.0f; // Silence duration to trigger processing + float max_buffer_duration_s = 30.0f; // Maximum buffer duration in seconds (prevents unbounded growth) }; // Audio capture callback function type @@ -78,11 +85,12 @@ class AudioCapture { size_t silence_sample_count_; size_t min_speech_samples_; size_t silence_samples_threshold_; + size_t max_buffer_samples_; // Maximum buffer size in samples // PortAudio callback (static function) - // Uses void* for time_info to avoid including portaudio.h in the header static int pa_callback(const void * input, void * output, unsigned long frame_count, - const void * time_info, unsigned long status_flags, void * user_data); + const PaStreamCallbackTimeInfo * time_info, + PaStreamCallbackFlags status_flags, void * user_data); // Instance callback handler void handle_audio_data(const float * data, unsigned long frame_count); diff --git a/tools/frameforge/frameforge-sidecar.cpp b/tools/frameforge/frameforge-sidecar.cpp index a8a1f0248ad..35c286ca627 100644 --- a/tools/frameforge/frameforge-sidecar.cpp +++ b/tools/frameforge/frameforge-sidecar.cpp @@ -352,6 +352,10 @@ int main(int argc, char ** argv) { return 1; } + // Set up signal handlers for graceful shutdown (once at the start) + std::signal(SIGINT, signal_handler); + std::signal(SIGTERM, signal_handler); + // Load verb definitions if provided if (!params.verb_definitions_file.empty()) { fprintf(stderr, "Loading verb definitions from: %s\n", params.verb_definitions_file.c_str()); @@ -485,10 +489,6 @@ int main(int argc, char ** argv) { return 1; } - // Set up signal handlers for graceful shutdown - std::signal(SIGINT, signal_handler); - std::signal(SIGTERM, signal_handler); - fprintf(stderr, "FrameForge Sidecar ready. Listening to microphone...\n"); fprintf(stderr, "Press Ctrl+C to stop\n"); fprintf(stderr, "VAD settings: min_speech=%.0fms, silence=%.0fms\n", @@ -574,10 +574,6 @@ int main(int argc, char ** argv) { return 1; } - // Set up signal handlers for graceful shutdown - std::signal(SIGINT, signal_handler); - std::signal(SIGTERM, signal_handler); - fprintf(stderr, "FrameForge Sidecar ready. Waiting for commands...\n"); fprintf(stderr, "Press Ctrl+C to stop\n");