diff --git a/cmd/mnemonic/runtime.go b/cmd/mnemonic/runtime.go
index b65332d2..63434667 100644
--- a/cmd/mnemonic/runtime.go
+++ b/cmd/mnemonic/runtime.go
@@ -187,9 +187,26 @@ func buildEncodingConfig(cfg *config.Config) encoding.EncodingConfig {
}
}
+// newAPIProvider creates an API-based LLM provider from config.
+func newAPIProvider(cfg *config.Config) llm.Provider {
+ timeout := time.Duration(cfg.LLM.TimeoutSec) * time.Second
+ if timeout == 0 {
+ timeout = 30 * time.Second
+ }
+ return llm.NewLMStudioProvider(
+ cfg.LLM.Endpoint,
+ cfg.LLM.ChatModel,
+ cfg.LLM.EmbeddingModel,
+ cfg.LLM.APIKey,
+ timeout,
+ cfg.LLM.MaxConcurrent,
+ )
+}
+
// newLLMProvider creates the appropriate LLM provider based on config.
// For "api" (default), it creates an LMStudioProvider for OpenAI-compatible APIs.
-// For "embedded", it creates an EmbeddedProvider for in-process llama.cpp inference.
+// For "embedded", it creates a SwitchableProvider with embedded as primary
+// and API as a fallback that can be toggled at runtime.
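+// A config.yaml sketch for this setup (keys are illustrative; they mirror the
+// cfg.LLM fields used below):
+//
+//	llm:
+//	  provider: embedded
+//	  endpoint: http://localhost:1234/v1   # non-empty endpoint enables the API fallback
+//	  chat_model: gemini-3-flash-preview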
func newLLMProvider(cfg *config.Config) llm.Provider {
switch cfg.LLM.Provider {
case "embedded":
@@ -215,19 +232,15 @@ func newLLMProvider(cfg *config.Config) llm.Provider {
} else {
slog.Warn("embedded provider selected but llama.cpp not compiled in (build with: make build-embedded)")
}
- return ep
- default: // "api" or ""
- timeout := time.Duration(cfg.LLM.TimeoutSec) * time.Second
- if timeout == 0 {
- timeout = 30 * time.Second
+
+ // Create API provider as runtime fallback (Gemini, etc.)
+ var apiProvider llm.Provider
+ if cfg.LLM.Endpoint != "" {
+ apiProvider = newAPIProvider(cfg)
}
- return llm.NewLMStudioProvider(
- cfg.LLM.Endpoint,
- cfg.LLM.ChatModel,
- cfg.LLM.EmbeddingModel,
- cfg.LLM.APIKey,
- timeout,
- cfg.LLM.MaxConcurrent,
- )
+
+ return llm.NewSwitchableProvider(ep, apiProvider, cfg.LLM.ChatModel)
+ default: // "api" or ""
+ return newAPIProvider(cfg)
}
}
diff --git a/cmd/mnemonic/serve.go b/cmd/mnemonic/serve.go
index 759a08ee..62335dc5 100644
--- a/cmd/mnemonic/serve.go
+++ b/cmd/mnemonic/serve.go
@@ -663,6 +663,12 @@ func serveCommand(configPath string) {
StartTime: time.Now(),
Log: log,
}
+ // Wire model manager if using switchable/embedded provider
+ if sp, ok := llmProvider.(*llm.SwitchableProvider); ok {
+ apiDeps.ModelManager = sp
+ } else if ep, ok := llmProvider.(*llm.EmbeddedProvider); ok {
+ apiDeps.ModelManager = ep
+ }
// Only set Consolidator if it's non-nil (avoids Go nil-interface trap)
if consolidator != nil {
apiDeps.Consolidator = consolidator
diff --git a/internal/api/routes/models.go b/internal/api/routes/models.go
new file mode 100644
index 00000000..4ea536d6
--- /dev/null
+++ b/internal/api/routes/models.go
@@ -0,0 +1,119 @@
+package routes
+
+import (
+ "encoding/json"
+ "log/slog"
+ "net/http"
+
+ "github.com/appsprout-dev/mnemonic/internal/llm"
+)
+
+// HandleListModels returns available GGUF models in the models directory.
+func HandleListModels(mgr llm.ModelManager, log *slog.Logger) http.HandlerFunc {
+ return func(w http.ResponseWriter, r *http.Request) {
+ if mgr == nil {
+ writeJSON(w, http.StatusOK, map[string]interface{}{
+ "models": []interface{}{},
+ "enabled": false,
+ "message": "embedded provider not active",
+ })
+ return
+ }
+
+ models, err := mgr.ListAvailableModels()
+ if err != nil {
+ log.Error("failed to list models", "error", err)
+ writeError(w, http.StatusInternalServerError, "failed to list models: "+err.Error(), "MODEL_ERROR")
+ return
+ }
+
+ active := mgr.ActiveModel()
+
+ writeJSON(w, http.StatusOK, map[string]interface{}{
+ "models": models,
+ "active": active,
+ "enabled": true,
+ "mode": mgr.ProviderMode(),
+ })
+ }
+}
+
+// HandleActiveModel returns the currently loaded model status.
+func HandleActiveModel(mgr llm.ModelManager, log *slog.Logger) http.HandlerFunc {
+ return func(w http.ResponseWriter, r *http.Request) {
+ if mgr == nil {
+ writeJSON(w, http.StatusOK, map[string]interface{}{
+ "enabled": false,
+ "message": "embedded provider not active",
+ })
+ return
+ }
+
+ active := mgr.ActiveModel()
+ writeJSON(w, http.StatusOK, map[string]interface{}{
+ "active": active,
+ "enabled": true,
+ })
+ }
+}
+
+// swapModelRequest is the JSON body for POST /api/v1/models/active.
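+// Example bodies (filename illustrative):
+//
+//	{"chat_model": "qwen35-chat-q4.gguf"}
+//	{"mode": "api"}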
+type swapModelRequest struct {
+ ChatModel string `json:"chat_model"`
+ EmbedModel string `json:"embed_model"`
+ Mode string `json:"mode"` // "embedded" or "api" — switches provider
+}
+
+// HandleSwapModel hot-swaps the active chat or embedding model.
+func HandleSwapModel(mgr llm.ModelManager, log *slog.Logger) http.HandlerFunc {
+ return func(w http.ResponseWriter, r *http.Request) {
+ if mgr == nil {
+ writeError(w, http.StatusBadRequest, "embedded provider not active — model swap unavailable", "MODEL_ERROR")
+ return
+ }
+
+ var req swapModelRequest
+ if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
+ writeError(w, http.StatusBadRequest, "invalid JSON body: "+err.Error(), "INVALID_PARAM")
+ return
+ }
+
+ if req.ChatModel == "" && req.EmbedModel == "" && req.Mode == "" {
+ writeError(w, http.StatusBadRequest, "specify chat_model, embed_model, or mode", "INVALID_PARAM")
+ return
+ }
+
+ if req.Mode != "" {
+ log.Info("switching provider mode", "mode", req.Mode)
+ if err := mgr.SetProviderMode(req.Mode); err != nil {
+ log.Error("failed to switch provider mode", "error", err)
+ writeError(w, http.StatusBadRequest, "failed to switch mode: "+err.Error(), "MODEL_ERROR")
+ return
+ }
+ }
+
+ if req.ChatModel != "" {
+ log.Info("swapping chat model", "model", req.ChatModel)
+ if err := mgr.SwapChatModel(req.ChatModel); err != nil {
+ log.Error("failed to swap chat model", "error", err)
+ writeError(w, http.StatusInternalServerError, "failed to swap chat model: "+err.Error(), "MODEL_ERROR")
+ return
+ }
+ }
+
+ if req.EmbedModel != "" {
+ log.Info("swapping embed model", "model", req.EmbedModel)
+ if err := mgr.SwapEmbedModel(req.EmbedModel); err != nil {
+ log.Error("failed to swap embed model", "error", err)
+ writeError(w, http.StatusInternalServerError, "failed to swap embed model: "+err.Error(), "MODEL_ERROR")
+ return
+ }
+ }
+
+ active := mgr.ActiveModel()
+ writeJSON(w, http.StatusOK, map[string]interface{}{
+ "status": "ok",
+ "active": active,
+ })
+ }
+}
diff --git a/internal/api/server.go b/internal/api/server.go
index 64bb97ce..9ea6528d 100644
--- a/internal/api/server.go
+++ b/internal/api/server.go
@@ -30,6 +30,7 @@ type ServerConfig struct {
type ServerDeps struct {
Store store.Store
LLM llm.Provider
+ ModelManager llm.ModelManager // can be nil if not using embedded provider
Bus events.Bus
Retriever *retrieval.RetrievalAgent
Consolidator routes.ConsolidationRunner // can be nil if disabled
@@ -132,6 +133,11 @@ func (s *Server) registerRoutes() {
s.mux.HandleFunc("GET /api/v1/abstractions", routes.HandleListAbstractions(s.deps.Store, s.deps.Log))
s.mux.HandleFunc("GET /api/v1/projects", routes.HandleListProjects(s.deps.Store, s.deps.Log))
+ // Model management (control center)
+ s.mux.HandleFunc("GET /api/v1/models", routes.HandleListModels(s.deps.ModelManager, s.deps.Log))
+ s.mux.HandleFunc("GET /api/v1/models/active", routes.HandleActiveModel(s.deps.ModelManager, s.deps.Log))
+ s.mux.HandleFunc("POST /api/v1/models/active", routes.HandleSwapModel(s.deps.ModelManager, s.deps.Log))
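+ // Example (address and body illustrative):
+ //   curl -X POST http://localhost:8080/api/v1/models/active -d '{"mode":"api"}'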
+
// LLM usage monitoring
s.mux.HandleFunc("GET /api/v1/llm/usage", routes.HandleLLMUsage(s.deps.Store, s.deps.Log))
diff --git a/internal/llm/embedded.go b/internal/llm/embedded.go
index 82d8fa2a..c3b06e6a 100644
--- a/internal/llm/embedded.go
+++ b/internal/llm/embedded.go
@@ -56,6 +56,26 @@ type BackendCompletionResponse struct {
MinProb float32 // minimum probability of any chosen token (0-1)
}
+// AvailableModel describes a GGUF model available for loading.
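+// Serialized example (values illustrative):
+//
+//	{"filename": "qwen35-chat-q4.gguf", "path": "/models/qwen35-chat-q4.gguf",
+//	 "size_mb": 4096, "role": "chat", "quantize": "RQ4"}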
+type AvailableModel struct {
+ Filename string `json:"filename"`
+ Path string `json:"path"`
+ SizeMB int64 `json:"size_mb"`
+ Role string `json:"role,omitempty"` // "chat" or "embedding"
+ Version string `json:"version,omitempty"` // model version
+ Quantize string `json:"quantize,omitempty"` // quantization type
+}
+
+// ModelStatus reports the currently loaded model state.
+type ModelStatus struct {
+ ChatModel string `json:"chat_model"`
+ EmbedModel string `json:"embed_model"`
+ Loaded bool `json:"loaded"`
+ ModelsDir string `json:"models_dir"`
+ Mode string `json:"mode,omitempty"` // "embedded" or "api"
+ APIModel string `json:"api_model,omitempty"` // cloud model name when in API mode
+}
+
// EmbeddedProvider implements the Provider interface using in-process inference
// via a Backend (llama.cpp CGo bindings). This allows mnemonic to run its own
// GGUF models without an external API server.
@@ -67,10 +87,11 @@ type EmbeddedProvider struct {
maxTokens int
temperature float32
- mu sync.RWMutex
- chatBackend Backend
- embedBackend Backend
- sem chan struct{}
+ mu sync.RWMutex
+ chatBackend Backend
+ embedBackend Backend
+ sem chan struct{}
+ backendFactory func() Backend
}
// EmbeddedProviderConfig holds the configuration for creating an EmbeddedProvider.
@@ -117,10 +138,13 @@ func NewEmbeddedProvider(cfg EmbeddedProviderConfig) *EmbeddedProvider {
// LoadModels loads the configured GGUF model files using the given backend factory.
// backendFactory creates a new Backend instance for each model.
+// The factory is retained for later hot-swap operations.
func (p *EmbeddedProvider) LoadModels(backendFactory func() Backend) error {
p.mu.Lock()
defer p.mu.Unlock()
+ p.backendFactory = backendFactory
+
// Load chat model
chatPath := filepath.Join(p.modelsDir, p.chatModelFile)
if _, err := os.Stat(chatPath); err != nil {
@@ -167,18 +191,26 @@ func (p *EmbeddedProvider) release() {
<-p.sem
}
-// formatPrompt converts a slice of Messages into a single prompt string.
-// Uses the Felix-LM fine-tuning format: <|system|>\n...\n<|user|>\n...\n<|assistant|>\n
+// formatPrompt converts a slice of Messages into a prompt string.
+// Uses ChatML format (Qwen 3.5, Gemma-it, etc.):
+//
+// <|im_start|>system\n...<|im_end|>\n<|im_start|>user\n...<|im_end|>\n<|im_start|>assistant\n
+//
+// Appends /no_think to the system message to disable Qwen's thinking mode,
+// which interferes with GBNF grammar-constrained generation.
func formatPrompt(messages []Message) string {
var b strings.Builder
for _, msg := range messages {
- b.WriteString("<|")
+ b.WriteString("<|im_start|>")
b.WriteString(msg.Role)
- b.WriteString("|>\n")
- b.WriteString(msg.Content)
b.WriteByte('\n')
+ b.WriteString(msg.Content)
+ if msg.Role == "system" {
+ b.WriteString(" /no_think")
+ }
+ b.WriteString("<|im_end|>\n")
}
- b.WriteString("<|assistant|>\n")
+ b.WriteString("<|im_start|>assistant\n")
return b.String()
}
@@ -233,12 +265,25 @@ func (p *EmbeddedProvider) Complete(ctx context.Context, req CompletionRequest)
"prompt_chars", len(prompt))
}
+ // Ensure <|im_end|> is a stop sequence so the model stops at turn boundary.
+ stop := req.Stop
+ hasIMEnd := false
+ for _, s := range stop {
+ if s == "<|im_end|>" {
+ hasIMEnd = true
+ break
+ }
+ }
+ if !hasIMEnd {
+ stop = append(stop, "<|im_end|>")
+ }
+
backendReq := BackendCompletionRequest{
Prompt: prompt,
MaxTokens: maxTokens,
Temperature: temp,
TopP: req.TopP,
- Stop: req.Stop,
+ Stop: stop,
Grammar: grammar,
}
@@ -247,8 +292,11 @@ func (p *EmbeddedProvider) Complete(ctx context.Context, req CompletionRequest)
return CompletionResponse{}, fmt.Errorf("embedded completion: %w", err)
}
+ // Strip Qwen-style <think>...</think> wrapper if present.
+ content := stripThinkingTokens(backendResp.Text)
+
return CompletionResponse{
- Content: backendResp.Text,
+ Content: content,
StopReason: "stop",
TokensUsed: backendResp.PromptTokens + backendResp.CompletionTokens,
PromptTokens: backendResp.PromptTokens,
@@ -334,6 +382,211 @@ func (p *EmbeddedProvider) ModelInfo(ctx context.Context) (ModelMetadata, error)
}, nil
}
+// ListAvailableModels returns models registered in models.json.
+// Only curated, production-ready models appear — not every GGUF file on disk.
+func (p *EmbeddedProvider) ListAvailableModels() ([]AvailableModel, error) {
+ p.mu.RLock()
+ dir := p.modelsDir
+ p.mu.RUnlock()
+
+ manifest, err := LoadManifest(dir)
+ if err != nil {
+ return nil, fmt.Errorf("loading model manifest: %w", err)
+ }
+
+ var models []AvailableModel
+ for _, entry := range manifest.Models {
+ models = append(models, AvailableModel{
+ Filename: entry.Filename,
+ Path: filepath.Join(dir, entry.Filename),
+ SizeMB: entry.SizeBytes / (1024 * 1024),
+ Role: entry.Role,
+ Version: entry.Version,
+ Quantize: entry.Quantize,
+ })
+ }
+ return models, nil
+}
+
+// ActiveModel returns the currently loaded model status.
+func (p *EmbeddedProvider) ActiveModel() ModelStatus {
+ p.mu.RLock()
+ defer p.mu.RUnlock()
+ return ModelStatus{
+ ChatModel: p.chatModelFile,
+ EmbedModel: p.embedModelFile,
+ Loaded: p.chatBackend != nil,
+ ModelsDir: p.modelsDir,
+ }
+}
+
+// SwapChatModel hot-swaps the chat model to a different GGUF file.
+// The old backend is closed after the new one is loaded successfully.
+func (p *EmbeddedProvider) SwapChatModel(filename string) error {
+ p.mu.RLock()
+ factory := p.backendFactory
+ dir := p.modelsDir
+ opts := p.opts
+ p.mu.RUnlock()
+
+ if factory == nil {
+ return fmt.Errorf("no backend factory configured — cannot swap models")
+ }
+
+ modelPath := filepath.Join(dir, filename)
+ if _, err := os.Stat(modelPath); err != nil {
+ return fmt.Errorf("model not found at %s: %w", modelPath, err)
+ }
+
+ // Load new model before acquiring write lock
+ newBackend := factory()
+ if err := newBackend.LoadModel(modelPath, opts); err != nil {
+ return fmt.Errorf("loading new chat model %s: %w", filename, err)
+ }
+ slog.Info("loaded new chat model for swap", "path", modelPath)
+
+ // Swap under write lock
+ p.mu.Lock()
+ oldBackend := p.chatBackend
+ p.chatBackend = newBackend
+ p.chatModelFile = filename
+ p.mu.Unlock()
+
+ // Close old backend outside the lock
+ if oldBackend != nil {
+ if err := oldBackend.Close(); err != nil {
+ slog.Warn("error closing old chat backend during swap", "error", err)
+ }
+ }
+
+ slog.Info("chat model swapped", "model", filename)
+ return nil
+}
+
+// SwapEmbedModel hot-swaps the embedding model to a different GGUF file.
+// Pass empty string to clear the dedicated embedding model (falls back to chat backend).
+func (p *EmbeddedProvider) SwapEmbedModel(filename string) error {
+ if filename == "" {
+ p.mu.Lock()
+ oldBackend := p.embedBackend
+ p.embedBackend = nil
+ p.embedModelFile = ""
+ p.mu.Unlock()
+ if oldBackend != nil {
+ if err := oldBackend.Close(); err != nil {
+ slog.Warn("error closing old embed backend during swap", "error", err)
+ }
+ }
+ slog.Info("embed model cleared, using chat backend for embeddings")
+ return nil
+ }
+
+ p.mu.RLock()
+ factory := p.backendFactory
+ dir := p.modelsDir
+ opts := p.opts
+ p.mu.RUnlock()
+
+ if factory == nil {
+ return fmt.Errorf("no backend factory configured — cannot swap models")
+ }
+
+ modelPath := filepath.Join(dir, filename)
+ if _, err := os.Stat(modelPath); err != nil {
+ return fmt.Errorf("embed model not found at %s: %w", modelPath, err)
+ }
+
+ newBackend := factory()
+ if err := newBackend.LoadModel(modelPath, opts); err != nil {
+ return fmt.Errorf("loading new embed model %s: %w", filename, err)
+ }
+
+ p.mu.Lock()
+ oldBackend := p.embedBackend
+ p.embedBackend = newBackend
+ p.embedModelFile = filename
+ p.mu.Unlock()
+
+ if oldBackend != nil {
+ if err := oldBackend.Close(); err != nil {
+ slog.Warn("error closing old embed backend during swap", "error", err)
+ }
+ }
+
+ slog.Info("embed model swapped", "model", filename)
+ return nil
+}
+
+// Unload releases all backend resources without destroying the provider config.
+// The provider can be reloaded later with Reload().
+func (p *EmbeddedProvider) Unload() {
+ p.mu.Lock()
+ defer p.mu.Unlock()
+
+ if p.chatBackend != nil {
+ if err := p.chatBackend.Close(); err != nil {
+ slog.Warn("error closing chat backend during unload", "error", err)
+ }
+ p.chatBackend = nil
+ slog.Info("unloaded chat model", "model", p.chatModelFile)
+ }
+ if p.embedBackend != nil {
+ if err := p.embedBackend.Close(); err != nil {
+ slog.Warn("error closing embed backend during unload", "error", err)
+ }
+ p.embedBackend = nil
+ slog.Info("unloaded embed model", "model", p.embedModelFile)
+ }
+}
+
+// Reload reloads models using the stored backend factory.
+// Called after Unload() to restore embedded inference.
+func (p *EmbeddedProvider) Reload() error {
+ p.mu.RLock()
+ factory := p.backendFactory
+ p.mu.RUnlock()
+
+ if factory == nil {
+ return fmt.Errorf("no backend factory configured — cannot reload")
+ }
+ return p.LoadModels(factory)
+}
+
+// SetProviderMode on a bare EmbeddedProvider only supports "embedded".
+func (p *EmbeddedProvider) SetProviderMode(mode string) error {
+ if mode == "embedded" {
+ return nil
+ }
+ return fmt.Errorf("API provider not configured — only embedded mode available")
+}
+
+// ProviderMode always returns "embedded" for a bare EmbeddedProvider.
+func (p *EmbeddedProvider) ProviderMode() string {
+ return "embedded"
+}
+
+// stripThinkingTokens removes <think>...</think> blocks from model output.
+// Qwen 3.5 and similar models prepend reasoning tokens before the actual response.
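+// For example, "<think>plan</think>Answer" becomes "Answer"; an unclosed
+// "<think>" drops everything from the tag to the end of the text.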
+func stripThinkingTokens(text string) string {
+ const openTag = "<think>"
+ const closeTag = "</think>"
+
+ for {
+ start := strings.Index(text, openTag)
+ if start == -1 {
+ break
+ }
+ end := strings.Index(text[start:], closeTag)
+ if end == -1 {
+ // Unclosed think tag — strip from start to end of text
+ text = strings.TrimSpace(text[:start])
+ break
+ }
+ text = text[:start] + text[start+end+len(closeTag):]
+ }
+ return strings.TrimSpace(text)
+}
+
// Close releases all backend resources.
func (p *EmbeddedProvider) Close() error {
p.mu.Lock()
diff --git a/internal/llm/provider.go b/internal/llm/provider.go
index 90da6a0e..df49f7b8 100644
--- a/internal/llm/provider.go
+++ b/internal/llm/provider.go
@@ -111,6 +111,28 @@ type Provider interface {
ModelInfo(ctx context.Context) (ModelMetadata, error)
}
+// ModelManager is the interface for runtime model management.
+// Implemented by SwitchableProvider (embedded + API) or EmbeddedProvider alone.
+type ModelManager interface {
+ // ListAvailableModels returns models from the manifest.
+ ListAvailableModels() ([]AvailableModel, error)
+
+ // ActiveModel returns the currently loaded model status.
+ ActiveModel() ModelStatus
+
+ // SwapChatModel hot-swaps the chat model to a different GGUF file.
+ SwapChatModel(filename string) error
+
+ // SwapEmbedModel hot-swaps the embedding model. Empty string clears it.
+ SwapEmbedModel(filename string) error
+
+ // SetProviderMode switches between "embedded" and "api" at runtime.
+ SetProviderMode(mode string) error
+
+ // ProviderMode returns the current mode ("embedded" or "api").
+ ProviderMode() string
+}
+
// ErrProviderUnavailable is returned when the LLM backend is not reachable.
type ErrProviderUnavailable struct {
Endpoint string
diff --git a/internal/llm/switchable.go b/internal/llm/switchable.go
new file mode 100644
index 00000000..4ce76456
--- /dev/null
+++ b/internal/llm/switchable.go
@@ -0,0 +1,150 @@
+package llm
+
+import (
+ "context"
+ "fmt"
+ "sync"
+)
+
+// SwitchableProvider wraps an embedded provider and an API provider,
+// allowing runtime switching between local inference and cloud API.
+// All agents hold references to this provider, so switching takes effect
+// immediately across the entire daemon.
+type SwitchableProvider struct {
+ mu sync.RWMutex
+ embedded *EmbeddedProvider
+ api Provider
+ useAPI bool
+ apiModel string // model name for display (e.g. "gemini-3-flash-preview")
+}
+
+// NewSwitchableProvider creates a provider that can toggle between embedded and API.
+// Starts in embedded mode.
+func NewSwitchableProvider(embedded *EmbeddedProvider, api Provider, apiModel string) *SwitchableProvider {
+ return &SwitchableProvider{
+ embedded: embedded,
+ api: api,
+ apiModel: apiModel,
+ }
+}
+
+func (s *SwitchableProvider) active() Provider {
+ if s.useAPI {
+ return s.api
+ }
+ return s.embedded
+}
+
+// Complete delegates to the active provider.
+func (s *SwitchableProvider) Complete(ctx context.Context, req CompletionRequest) (CompletionResponse, error) {
+ s.mu.RLock()
+ p := s.active()
+ s.mu.RUnlock()
+ return p.Complete(ctx, req)
+}
+
+// Embed delegates to the active provider.
+func (s *SwitchableProvider) Embed(ctx context.Context, text string) ([]float32, error) {
+ s.mu.RLock()
+ p := s.active()
+ s.mu.RUnlock()
+ return p.Embed(ctx, text)
+}
+
+// BatchEmbed delegates to the active provider.
+func (s *SwitchableProvider) BatchEmbed(ctx context.Context, texts []string) ([][]float32, error) {
+ s.mu.RLock()
+ p := s.active()
+ s.mu.RUnlock()
+ return p.BatchEmbed(ctx, texts)
+}
+
+// Health delegates to the active provider.
+func (s *SwitchableProvider) Health(ctx context.Context) error {
+ s.mu.RLock()
+ p := s.active()
+ s.mu.RUnlock()
+ return p.Health(ctx)
+}
+
+// ModelInfo delegates to the active provider.
+func (s *SwitchableProvider) ModelInfo(ctx context.Context) (ModelMetadata, error) {
+ s.mu.RLock()
+ p := s.active()
+ s.mu.RUnlock()
+ return p.ModelInfo(ctx)
+}
+
+// --- ModelManager implementation ---
+
+// ListAvailableModels delegates to the embedded provider.
+func (s *SwitchableProvider) ListAvailableModels() ([]AvailableModel, error) {
+ return s.embedded.ListAvailableModels()
+}
+
+// ActiveModel returns the current model status including provider mode.
+func (s *SwitchableProvider) ActiveModel() ModelStatus {
+ s.mu.RLock()
+ defer s.mu.RUnlock()
+ status := s.embedded.ActiveModel()
+ if s.useAPI {
+ status.Mode = "api"
+ status.APIModel = s.apiModel
+ } else {
+ status.Mode = "embedded"
+ }
+ return status
+}
+
+// SwapChatModel delegates to the embedded provider.
+func (s *SwitchableProvider) SwapChatModel(filename string) error {
+ return s.embedded.SwapChatModel(filename)
+}
+
+// SwapEmbedModel delegates to the embedded provider.
+func (s *SwitchableProvider) SwapEmbedModel(filename string) error {
+ return s.embedded.SwapEmbedModel(filename)
+}
+
+// SetProviderMode switches between "embedded" and "api" at runtime.
+// Switching to API unloads embedded models to free VRAM.
+// Switching back to embedded reloads them.
+func (s *SwitchableProvider) SetProviderMode(mode string) error {
+ s.mu.Lock()
+ defer s.mu.Unlock()
+
+ switch mode {
+ case "embedded":
+ if !s.useAPI {
+ return nil // already in embedded mode
+ }
+ // Reload models
+ if err := s.embedded.Reload(); err != nil {
+ return fmt.Errorf("reloading embedded models: %w", err)
+ }
+ s.useAPI = false
+ case "api":
+ if s.api == nil {
+ return fmt.Errorf("API provider not configured")
+ }
+ if s.useAPI {
+ return nil // already in API mode
+ }
+ s.useAPI = true
+ // Unload models to free VRAM
+ s.embedded.Unload()
+ default:
+ return fmt.Errorf("unknown provider mode: %q (use \"embedded\" or \"api\")", mode)
+ }
+ return nil
+}
+
+// ProviderMode returns the current mode ("embedded" or "api").
+func (s *SwitchableProvider) ProviderMode() string {
+ s.mu.RLock()
+ defer s.mu.RUnlock()
+ if s.useAPI {
+ return "api"
+ }
+ return "embedded"
+}
diff --git a/internal/web/static/index.html b/internal/web/static/index.html
index 53310092..b5c84823 100644
--- a/internal/web/static/index.html
+++ b/internal/web/static/index.html
@@ -1109,6 +1109,7 @@
+        <button onclick="switchView('models')">Models</button>
@@ -1554,6 +1555,47 @@
What do you remember?
+
+      <!-- Models control center -->
+      <div id="modelsView" class="view">
+        <div class="stat-row">
+          <div class="stat"><label>Chat model</label><span id="modelChatName">-</span></div>
+          <div class="stat"><label>Embedding model</label><span id="modelEmbedName">-</span></div>
+          <div class="stat"><label>Status</label><span id="modelStatus">-</span></div>
+          <div class="stat"><label>Models dir</label><span id="modelDir">-</span></div>
+        </div>
+        <div id="modelModeToggle"></div>
+        <div id="modelSwapStatus" style="display:none">
+          <pre id="modelSwapLog"></pre>
+        </div>
+        <h3>Available Models</h3>
+        <table>
+          <thead><tr><th>Model</th><th>Role</th><th>Size</th><th>Status</th><th>Action</th></tr></thead>
+          <tbody id="modelsTableBody"><tr><td colspan="5">Loading...</td></tr></tbody>
+        </table>
+        <div id="modelsUpdated"></div>
+      </div>
diff --git a/internal/web/static/js/app.js b/internal/web/static/js/app.js
index 5bf84ca7..a5fbd909 100644
--- a/internal/web/static/js/app.js
+++ b/internal/web/static/js/app.js
@@ -28,6 +28,7 @@ import { loadAgentData, refreshAgentData, renderAgentDashboard, sendChatMessage,
toggleChatHistory, startNewConversation, onModelChange,
checkForUpdate, triggerUpdate, renderMarkdown } from './agent.js';
import { formatBytes } from './llm.js';
+import { loadModels, swapChatModel, swapEmbedModel, switchProviderMode } from './models.js';
// ── Wire to window for HTML onclick handlers ──
Object.assign(window, {
@@ -58,6 +59,8 @@ Object.assign(window, {
loadAgentData, refreshAgentData, renderAgentDashboard, sendChatMessage,
toggleChatHistory, startNewConversation, onModelChange,
checkForUpdate, triggerUpdate,
+ // Models
+ loadModels, swapChatModel, swapEmbedModel, switchProviderMode,
// Utils (used by some inline HTML)
escapeHtml, showToast, relativeTime, simpleMarkdown, toggleToolDetail,
agentProfile, memoryType, safeSalience, renderMarkdown, formatBytes,
diff --git a/internal/web/static/js/models.js b/internal/web/static/js/models.js
new file mode 100644
index 00000000..3d96f455
--- /dev/null
+++ b/internal/web/static/js/models.js
@@ -0,0 +1,161 @@
+import { state } from './state.js';
+import { fetchJSON, escapeHtml } from './utils.js';
+
+var _swapLog = [];
+
+function appendSwapLog(msg) {
+ _swapLog.push('[' + new Date().toLocaleTimeString() + '] ' + msg);
+ var el = document.getElementById('modelSwapLog');
+ var panel = document.getElementById('modelSwapStatus');
+ if (el && panel) {
+ panel.style.display = '';
+ el.textContent = _swapLog.join('\n');
+ el.scrollTop = el.scrollHeight;
+ }
+}
+
+export async function loadModels() {
+ try {
+ var data = await fetchJSON('/models');
+ state.modelsLoaded = true;
+
+ if (!data.enabled) {
+ document.getElementById('modelChatName').textContent = 'N/A';
+ document.getElementById('modelEmbedName').textContent = 'N/A';
+ document.getElementById('modelStatus').textContent = 'Not available';
+ document.getElementById('modelStatus').style.color = 'var(--text-dim)';
+ document.getElementById('modelDir').textContent = '-';
+ document.getElementById('modelModeToggle').innerHTML = '';
+ document.getElementById('modelsTableBody').innerHTML = '<tr><td colspan="5">Embedded provider not active. Set llm.provider: "embedded" in config.yaml and rebuild with make build-embedded</td></tr>';
+ document.getElementById('modelsUpdated').textContent = 'Updated ' + new Date().toLocaleTimeString();
+ return;
+ }
+
+ var active = data.active || {};
+ var mode = data.mode || active.mode || 'embedded';
+ var isAPI = mode === 'api';
+
+ // Mode toggle button
+ var toggleEl = document.getElementById('modelModeToggle');
+ if (isAPI) {
+ document.getElementById('modelChatName').textContent = active.api_model || 'Gemini';
+ document.getElementById('modelEmbedName').textContent = 'API';
+ document.getElementById('modelStatus').textContent = 'Cloud API';
+ document.getElementById('modelStatus').style.color = 'var(--accent-cyan)';
+ toggleEl.innerHTML = '<button onclick="switchProviderMode(\'embedded\')">Switch to Embedded (local)</button>';
+ } else {
+ document.getElementById('modelChatName').textContent = active.chat_model || 'none';
+ document.getElementById('modelEmbedName').textContent = active.embed_model || '(using chat model)';
+ if (active.loaded) {
+ document.getElementById('modelStatus').textContent = 'Loaded';
+ document.getElementById('modelStatus').style.color = 'var(--accent-green)';
+ } else {
+ document.getElementById('modelStatus').textContent = 'Not loaded';
+ document.getElementById('modelStatus').style.color = 'var(--accent-red)';
+ }
+ toggleEl.innerHTML = '<button onclick="switchProviderMode(\'api\')">Switch to Gemini (API)</button>';
+ }
+ document.getElementById('modelDir').textContent = active.models_dir || '-';
+
+ var models = data.models || [];
+ var tbody = document.getElementById('modelsTableBody');
+
+ if (models.length === 0) {
+ tbody.innerHTML = '<tr><td colspan="5">No models in models.json.</td></tr>';
+ } else {
+ tbody.innerHTML = models.map(function(m) {
+ var isChatActive = !isAPI && m.filename === active.chat_model;
+ var isEmbedActive = !isAPI && m.filename === active.embed_model;
+ var status = '';
+ if (isChatActive) status = 'active';
+ else if (isEmbedActive) status = 'active';
+ else if (isAPI) status = 'standby';
+
+ var roleLabel = m.role || '-';
+ var detail = [m.quantize, m.version].filter(Boolean).join(' / ');
+
+ var actions = '';
+ if (!isAPI && m.role === 'chat' && !isChatActive) {
+ actions += '<button onclick="swapChatModel(\'' + m.filename + '\')">Load</button>';
+ }
+ if (!isAPI && m.role === 'embedding' && !isEmbedActive) {
+ actions += '<button onclick="swapEmbedModel(\'' + m.filename + '\')">Load</button>';
+ }
+ if (isChatActive || isEmbedActive) actions = '-';
+
+ return '<tr>' +
+ '<td>' + escapeHtml(m.filename) + (detail ? ' <small>' + escapeHtml(detail) + '</small>' : '') + '</td>' +
+ '<td>' + roleLabel + '</td>' +
+ '<td>' + m.size_mb + ' MB</td>' +
+ '<td>' + status + '</td>' +
+ '<td>' + actions + '</td>' +
+ '</tr>';
+ }).join('');
+ }
+
+ document.getElementById('modelsUpdated').textContent = 'Updated ' + new Date().toLocaleTimeString();
+ } catch (e) {
+ document.getElementById('modelsTableBody').innerHTML = '<tr><td colspan="5">Error: ' + escapeHtml(e.message) + '</td></tr>';
+ }
+}
+
+export async function switchProviderMode(mode) {
+ var label = mode === 'api' ? 'Gemini (API)' : 'Embedded (local)';
+ appendSwapLog('Switching to ' + label + '...');
+ try {
+ var resp = await fetch('/api/v1/models/active', {
+ method: 'POST',
+ headers: { 'Content-Type': 'application/json' },
+ body: JSON.stringify({ mode: mode })
+ });
+ var data = await resp.json();
+ if (!resp.ok) {
+ appendSwapLog('ERROR: ' + (data.error || 'unknown error'));
+ return;
+ }
+ appendSwapLog('Switched to ' + label);
+ loadModels();
+ } catch (e) {
+ appendSwapLog('ERROR: ' + e.message);
+ }
+}
+
+export async function swapChatModel(filename) {
+ appendSwapLog('Swapping chat model to ' + filename + '...');
+ try {
+ var resp = await fetch('/api/v1/models/active', {
+ method: 'POST',
+ headers: { 'Content-Type': 'application/json' },
+ body: JSON.stringify({ chat_model: filename })
+ });
+ var data = await resp.json();
+ if (!resp.ok) {
+ appendSwapLog('ERROR: ' + (data.error || 'unknown error'));
+ return;
+ }
+ appendSwapLog('Chat model swapped to ' + filename);
+ loadModels();
+ } catch (e) {
+ appendSwapLog('ERROR: ' + e.message);
+ }
+}
+
+export async function swapEmbedModel(filename) {
+ appendSwapLog('Swapping embed model to ' + filename + '...');
+ try {
+ var resp = await fetch('/api/v1/models/active', {
+ method: 'POST',
+ headers: { 'Content-Type': 'application/json' },
+ body: JSON.stringify({ embed_model: filename })
+ });
+ var data = await resp.json();
+ if (!resp.ok) {
+ appendSwapLog('ERROR: ' + (data.error || 'unknown error'));
+ return;
+ }
+ appendSwapLog('Embed model swapped to ' + filename);
+ loadModels();
+ } catch (e) {
+ appendSwapLog('ERROR: ' + e.message);
+ }
+}
diff --git a/internal/web/static/js/nav.js b/internal/web/static/js/nav.js
index 5eb5c143..3000674f 100644
--- a/internal/web/static/js/nav.js
+++ b/internal/web/static/js/nav.js
@@ -18,7 +18,7 @@ export function switchView(name) {
if (tab) tab.classList.add('active');
window.location.hash = name;
// Update breadcrumbs
- var crumbMap = { recall: 'Search', explore: 'Forum', timeline: 'Timeline', agent: 'SDK', llm: 'LLM Usage', tools: 'Tools' };
+ var crumbMap = { recall: 'Search', explore: 'Forum', timeline: 'Timeline', agent: 'SDK', llm: 'LLM Usage', tools: 'Tools', models: 'Models' };
var bc = document.getElementById('breadcrumbs');
if (bc && crumbMap[name]) bc.innerHTML = 'mnemonic › ' + crumbMap[name];
if (name === 'explore') {
@@ -30,6 +30,7 @@ export function switchView(name) {
if (name === 'agent' && !state.agentLoaded) window.loadAgentData();
if (name === 'llm' && !state.llmLoaded) window.loadLLMUsage();
if (name === 'tools' && !state.toolsLoaded) window.loadToolUsage();
+ if (name === 'models' && !state.modelsLoaded) window.loadModels();
}
export function switchExploreTab(tab) {
@@ -67,7 +68,7 @@ export function handleHash() {
if (catId) window.loadForumCategory(catId, catId);
return;
}
- if (['recall', 'explore', 'timeline', 'agent', 'llm', 'tools'].includes(hash)) switchView(hash);
+ if (['recall', 'explore', 'timeline', 'agent', 'llm', 'tools', 'models'].includes(hash)) switchView(hash);
}
window.addEventListener('hashchange', handleHash);
@@ -84,6 +85,7 @@ document.addEventListener('keydown', function(e) {
case '4': switchView('agent'); break;
case '5': switchView('llm'); break;
case '6': switchView('tools'); break;
+ case '7': switchView('models'); break;
}
});
diff --git a/training/scripts/export_qwen35_spokes.py b/training/scripts/export_qwen35_spokes.py
index 646cfe6b..1e46b015 100644
--- a/training/scripts/export_qwen35_spokes.py
+++ b/training/scripts/export_qwen35_spokes.py
@@ -154,6 +154,37 @@ def main():
for layer_idx in norm_layers:
spoke_tensors[f"blk.{layer_idx}.spoke.norm.weight"] = torch.ones(d_model, dtype=torch.float32)
+ # Build fused spoke matrices for fewer GPU kernel launches
+ # w_down_fused = cat([w_down[0], ..., w_down[n-1]], dim=0) -> (rank*n_spokes, d_model)
+ # w_up_fused = cat([w_up[0], ..., w_up[n-1]], dim=1) -> (d_model, rank*n_spokes)
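+ # e.g. rank=16, n_spokes=4, d_model=2048 (illustrative) -> w_down_fused (64, 2048), w_up_fused (2048, 64)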
+ n_spokes = spoke_config.num_spokes
+ fused_count = 0
+ for layer_idx in sorted(norm_layers):
+ w_downs = []
+ w_ups = []
+ for s in range(n_spokes):
+ down_key = f"blk.{layer_idx}.spoke.w_down.{s}.weight"
+ up_key = f"blk.{layer_idx}.spoke.w_up.{s}.weight"
+ if down_key in spoke_tensors and up_key in spoke_tensors:
+ w_downs.append(spoke_tensors[down_key])
+ w_ups.append(spoke_tensors[up_key])
+
+ if len(w_downs) == n_spokes:
+ # w_down[s] shape: (rank, d_model) -> concat along dim 0 -> (rank*n_spokes, d_model)
+ w_down_fused = torch.cat(w_downs, dim=0)
+ # w_up[s] shape: (d_model, rank) -> concat along dim 1 -> (d_model, rank*n_spokes)
+ w_up_fused = torch.cat(w_ups, dim=1)
+ spoke_tensors[f"blk.{layer_idx}.spoke.w_down_fused.weight"] = w_down_fused
+ spoke_tensors[f"blk.{layer_idx}.spoke.w_up_fused.weight"] = w_up_fused
+ # Remove individual spoke tensors (fused replaces them)
+ for s in range(n_spokes):
+ spoke_tensors.pop(f"blk.{layer_idx}.spoke.w_down.{s}.weight", None)
+ spoke_tensors.pop(f"blk.{layer_idx}.spoke.w_up.{s}.weight", None)
+ fused_count += 1
+
+ if fused_count > 0:
+ print(f" Created {fused_count} fused spoke matrix pairs (2 matmuls/layer instead of {2 * n_spokes})")
+
print(f" Prepared {len(spoke_tensors)} spoke tensors ({len(norm_layers)} layers)")
# --- Phase 3: Copy base GGUF and patch with spokes ---
diff --git a/training/scripts/quantize_rq4.py b/training/scripts/quantize_rq4.py
index c9c6ad30..18dbac8f 100644
--- a/training/scripts/quantize_rq4.py
+++ b/training/scripts/quantize_rq4.py
@@ -171,9 +171,13 @@ def main():
# Skip only norms, biases, embeddings, and individual (non-fused) spoke matrices.
is_individual_spoke = ("spoke" in t.name and "fused" not in t.name
and ("w_down" in t.name or "w_up" in t.name))
+ # Inner dim (ne[0] in GGUF) must be >= QK_RQ4 to form valid blocks.
+ # Qwen 3.5 hybrid has ssm_conv1d with ne[0]=4 which can't be quantized.
+ inner_dim_ok = int(t.shape[0]) >= QK_RQ4
should_quantize = (
len(t.shape) == 2
and t.n_elements >= args.min_elements
+ and inner_dim_ok
and not any(p in t.name for p in skip_patterns)
and not is_individual_spoke
)