diff --git a/cmd/mnemonic/runtime.go b/cmd/mnemonic/runtime.go
index b65332d2..63434667 100644
--- a/cmd/mnemonic/runtime.go
+++ b/cmd/mnemonic/runtime.go
@@ -187,9 +187,26 @@ func buildEncodingConfig(cfg *config.Config) encoding.EncodingConfig {
 	}
 }
 
+// newAPIProvider creates an API-based LLM provider from config.
+func newAPIProvider(cfg *config.Config) llm.Provider {
+	timeout := time.Duration(cfg.LLM.TimeoutSec) * time.Second
+	if timeout == 0 {
+		timeout = 30 * time.Second
+	}
+	return llm.NewLMStudioProvider(
+		cfg.LLM.Endpoint,
+		cfg.LLM.ChatModel,
+		cfg.LLM.EmbeddingModel,
+		cfg.LLM.APIKey,
+		timeout,
+		cfg.LLM.MaxConcurrent,
+	)
+}
+
 // newLLMProvider creates the appropriate LLM provider based on config.
 // For "api" (default), it creates an LMStudioProvider for OpenAI-compatible APIs.
-// For "embedded", it creates an EmbeddedProvider for in-process llama.cpp inference.
+// For "embedded", it creates a SwitchableProvider with embedded as primary
+// and API as a fallback that can be toggled at runtime.
 func newLLMProvider(cfg *config.Config) llm.Provider {
 	switch cfg.LLM.Provider {
 	case "embedded":
@@ -215,19 +232,15 @@ func newLLMProvider(cfg *config.Config) llm.Provider {
 		} else {
 			slog.Warn("embedded provider selected but llama.cpp not compiled in (build with: make build-embedded)")
 		}
-		return ep
-	default: // "api" or ""
-		timeout := time.Duration(cfg.LLM.TimeoutSec) * time.Second
-		if timeout == 0 {
-			timeout = 30 * time.Second
+
+		// Create API provider as runtime fallback (Gemini, etc.)
+		var apiProvider llm.Provider
+		if cfg.LLM.Endpoint != "" {
+			apiProvider = newAPIProvider(cfg)
 		}
-		return llm.NewLMStudioProvider(
-			cfg.LLM.Endpoint,
-			cfg.LLM.ChatModel,
-			cfg.LLM.EmbeddingModel,
-			cfg.LLM.APIKey,
-			timeout,
-			cfg.LLM.MaxConcurrent,
-		)
+
+		return llm.NewSwitchableProvider(ep, apiProvider, cfg.LLM.ChatModel)
+	default: // "api" or ""
+		return newAPIProvider(cfg)
 	}
 }
diff --git a/cmd/mnemonic/serve.go b/cmd/mnemonic/serve.go
index 759a08ee..62335dc5 100644
--- a/cmd/mnemonic/serve.go
+++ b/cmd/mnemonic/serve.go
@@ -663,6 +663,12 @@ func serveCommand(configPath string) {
 		StartTime: time.Now(),
 		Log:       log,
 	}
+	// Wire model manager if using switchable/embedded provider
+	if sp, ok := llmProvider.(*llm.SwitchableProvider); ok {
+		apiDeps.ModelManager = sp
+	} else if ep, ok := llmProvider.(*llm.EmbeddedProvider); ok {
+		apiDeps.ModelManager = ep
+	}
 	// Only set Consolidator if it's non-nil (avoids Go nil-interface trap)
 	if consolidator != nil {
 		apiDeps.Consolidator = consolidator
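Aside: the "nil-interface trap" guarded against above is easy to trip, so here is a minimal standalone sketch of it. The Runner and job names are illustrative, not from this codebase:

package main

import "fmt"

type Runner interface{ Run() }

type job struct{}

func (*job) Run() {}

func main() {
	var j *job // typed nil pointer

	var r Runner = j      // interface now holds (type=*job, value=nil)
	fmt.Println(r == nil) // false: the interface itself is non-nil
	fmt.Println(j == nil) // true

	// This is why serve.go only assigns Consolidator into the
	// interface-typed deps field after an explicit nil check.
}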
diff --git a/internal/api/routes/models.go b/internal/api/routes/models.go
new file mode 100644
index 00000000..4ea536d6
--- /dev/null
+++ b/internal/api/routes/models.go
@@ -0,0 +1,119 @@
+package routes
+
+import (
+	"encoding/json"
+	"log/slog"
+	"net/http"
+
+	"github.com/appsprout-dev/mnemonic/internal/llm"
+)
+
+// HandleListModels returns available GGUF models in the models directory.
+func HandleListModels(mgr llm.ModelManager, log *slog.Logger) http.HandlerFunc {
+	return func(w http.ResponseWriter, r *http.Request) {
+		if mgr == nil {
+			writeJSON(w, http.StatusOK, map[string]interface{}{
+				"models":  []interface{}{},
+				"enabled": false,
+				"message": "embedded provider not active",
+			})
+			return
+		}
+
+		models, err := mgr.ListAvailableModels()
+		if err != nil {
+			log.Error("failed to list models", "error", err)
+			writeError(w, http.StatusInternalServerError, "failed to list models: "+err.Error(), "MODEL_ERROR")
+			return
+		}
+
+		active := mgr.ActiveModel()
+
+		writeJSON(w, http.StatusOK, map[string]interface{}{
+			"models":  models,
+			"active":  active,
+			"enabled": true,
+			"mode":    mgr.ProviderMode(),
+		})
+	}
+}
+
+// HandleActiveModel returns the currently loaded model status.
+func HandleActiveModel(mgr llm.ModelManager, log *slog.Logger) http.HandlerFunc {
+	return func(w http.ResponseWriter, r *http.Request) {
+		if mgr == nil {
+			writeJSON(w, http.StatusOK, map[string]interface{}{
+				"enabled": false,
+				"message": "embedded provider not active",
+			})
+			return
+		}
+
+		active := mgr.ActiveModel()
+		writeJSON(w, http.StatusOK, map[string]interface{}{
+			"active":  active,
+			"enabled": true,
+		})
+	}
+}
+
+// swapModelRequest is the JSON body for POST /api/v1/models/active.
+type swapModelRequest struct {
+	ChatModel  string `json:"chat_model"`
+	EmbedModel string `json:"embed_model"`
+	Mode       string `json:"mode"` // "embedded" or "api" — switches provider
+}
+
+// HandleSwapModel hot-swaps the active chat or embedding model.
+func HandleSwapModel(mgr llm.ModelManager, log *slog.Logger) http.HandlerFunc {
+	return func(w http.ResponseWriter, r *http.Request) {
+		if mgr == nil {
+			writeError(w, http.StatusBadRequest, "embedded provider not active — model swap unavailable", "MODEL_ERROR")
+			return
+		}
+
+		var req swapModelRequest
+		if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
+			writeError(w, http.StatusBadRequest, "invalid JSON body: "+err.Error(), "INVALID_PARAM")
+			return
+		}
+
+		if req.ChatModel == "" && req.EmbedModel == "" && req.Mode == "" {
+			writeError(w, http.StatusBadRequest, "specify chat_model, embed_model, or mode", "INVALID_PARAM")
+			return
+		}
+
+		if req.Mode != "" {
+			log.Info("switching provider mode", "mode", req.Mode)
+			if err := mgr.SetProviderMode(req.Mode); err != nil {
+				log.Error("failed to switch provider mode", "error", err)
+				writeError(w, http.StatusBadRequest, "failed to switch mode: "+err.Error(), "MODEL_ERROR")
+				return
+			}
+		}
+
+		if req.ChatModel != "" {
+			log.Info("swapping chat model", "model", req.ChatModel)
+			if err := mgr.SwapChatModel(req.ChatModel); err != nil {
+				log.Error("failed to swap chat model", "error", err)
+				writeError(w, http.StatusInternalServerError, "failed to swap chat model: "+err.Error(), "MODEL_ERROR")
+				return
+			}
+		}
+
+		if req.EmbedModel != "" {
+			log.Info("swapping embed model", "model", req.EmbedModel)
+			if err := mgr.SwapEmbedModel(req.EmbedModel); err != nil {
+				log.Error("failed to swap embed model", "error", err)
+				writeError(w, http.StatusInternalServerError, "failed to swap embed model: "+err.Error(), "MODEL_ERROR")
+				return
+			}
+		}
+
+		active := mgr.ActiveModel()
+		writeJSON(w, http.StatusOK, map[string]interface{}{
+			"status": "ok",
+			"active": active,
+		})
+	}
+}
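A quick way to exercise the swap endpoint from Go; the host, port, and GGUF filename below are placeholders, and the body mirrors swapModelRequest above:

package main

import (
	"bytes"
	"fmt"
	"io"
	"net/http"
)

func main() {
	// "qwen3.5-chat-q4.gguf" is a placeholder filename; use one listed
	// by GET /api/v1/models.
	body := []byte(`{"chat_model": "qwen3.5-chat-q4.gguf"}`)
	resp, err := http.Post("http://localhost:8080/api/v1/models/active",
		"application/json", bytes.NewReader(body))
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()

	out, _ := io.ReadAll(resp.Body)
	// On success the handler replies {"status":"ok","active":{...}}.
	fmt.Println(resp.Status, string(out))
}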
diff --git a/internal/api/server.go b/internal/api/server.go
index 64bb97ce..9ea6528d 100644
--- a/internal/api/server.go
+++ b/internal/api/server.go
@@ -30,6 +30,7 @@ type ServerConfig struct {
 type ServerDeps struct {
 	Store        store.Store
 	LLM          llm.Provider
+	ModelManager llm.ModelManager // can be nil if not using embedded provider
 	Bus          events.Bus
 	Retriever    *retrieval.RetrievalAgent
 	Consolidator routes.ConsolidationRunner // can be nil if disabled
@@ -132,6 +133,11 @@ func (s *Server) registerRoutes() {
 	s.mux.HandleFunc("GET /api/v1/abstractions", routes.HandleListAbstractions(s.deps.Store, s.deps.Log))
 	s.mux.HandleFunc("GET /api/v1/projects", routes.HandleListProjects(s.deps.Store, s.deps.Log))
 
+	// Model management (control center)
+	s.mux.HandleFunc("GET /api/v1/models", routes.HandleListModels(s.deps.ModelManager, s.deps.Log))
+	s.mux.HandleFunc("GET /api/v1/models/active", routes.HandleActiveModel(s.deps.ModelManager, s.deps.Log))
+	s.mux.HandleFunc("POST /api/v1/models/active", routes.HandleSwapModel(s.deps.ModelManager, s.deps.Log))
+
 	// LLM usage monitoring
 	s.mux.HandleFunc("GET /api/v1/llm/usage", routes.HandleLLMUsage(s.deps.Store, s.deps.Log))
 
diff --git a/internal/llm/embedded.go b/internal/llm/embedded.go
index 82d8fa2a..c3b06e6a 100644
--- a/internal/llm/embedded.go
+++ b/internal/llm/embedded.go
@@ -56,6 +56,26 @@ type BackendCompletionResponse struct {
 	MinProb float32 // minimum probability of any chosen token (0-1)
 }
 
+// AvailableModel describes a GGUF model available for loading.
+type AvailableModel struct {
+	Filename string `json:"filename"`
+	Path     string `json:"path"`
+	SizeMB   int64  `json:"size_mb"`
+	Role     string `json:"role,omitempty"`     // "chat" or "embedding"
+	Version  string `json:"version,omitempty"`  // model version
+	Quantize string `json:"quantize,omitempty"` // quantization type
+}
+
+// ModelStatus reports the currently loaded model state.
+type ModelStatus struct {
+	ChatModel  string `json:"chat_model"`
+	EmbedModel string `json:"embed_model"`
+	Loaded     bool   `json:"loaded"`
+	ModelsDir  string `json:"models_dir"`
+	Mode       string `json:"mode,omitempty"`      // "embedded" or "api"
+	APIModel   string `json:"api_model,omitempty"` // cloud model name when in API mode
+}
+
 // EmbeddedProvider implements the Provider interface using in-process inference
 // via a Backend (llama.cpp CGo bindings). This allows mnemonic to run its own
 // GGUF models without an external API server.
@@ -67,10 +87,11 @@ type EmbeddedProvider struct {
 	maxTokens   int
 	temperature float32
 
-	mu           sync.RWMutex
-	chatBackend  Backend
-	embedBackend Backend
-	sem          chan struct{}
+	mu             sync.RWMutex
+	chatBackend    Backend
+	embedBackend   Backend
+	sem            chan struct{}
+	backendFactory func() Backend
 }
 
 // EmbeddedProviderConfig holds the configuration for creating an EmbeddedProvider.
@@ -117,10 +138,13 @@ func NewEmbeddedProvider(cfg EmbeddedProviderConfig) *EmbeddedProvider {
 
 // LoadModels loads the configured GGUF model files using the given backend factory.
 // backendFactory creates a new Backend instance for each model.
+// The factory is retained for later hot-swap operations.
 func (p *EmbeddedProvider) LoadModels(backendFactory func() Backend) error {
 	p.mu.Lock()
 	defer p.mu.Unlock()
 
+	p.backendFactory = backendFactory
+
 	// Load chat model
 	chatPath := filepath.Join(p.modelsDir, p.chatModelFile)
 	if _, err := os.Stat(chatPath); err != nil {
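A sketch of the intended call sequence, written as if inside this repo (the internal import only resolves within the module); EmbeddedProviderConfig fields are elided and newLlamaBackend is a stand-in for the real CGo backend constructor:

package main

import (
	"log/slog"

	"github.com/appsprout-dev/mnemonic/internal/llm"
)

// newLlamaBackend is a placeholder for whatever Backend implementation the
// build provides (e.g. the llama.cpp backend behind the build-embedded tag).
func newLlamaBackend() llm.Backend { panic("placeholder") }

func main() {
	// Field values elided; see EmbeddedProviderConfig in embedded.go.
	ep := llm.NewEmbeddedProvider(llm.EmbeddedProviderConfig{})

	// The factory is invoked once per model here, and retained so later
	// SwapChatModel / Reload calls can construct fresh Backend instances.
	if err := ep.LoadModels(func() llm.Backend { return newLlamaBackend() }); err != nil {
		slog.Warn("embedded models not loaded", "error", err)
	}
}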
@@ -167,18 +191,26 @@ func (p *EmbeddedProvider) release() {
 	<-p.sem
 }
 
-// formatPrompt converts a slice of Messages into a single prompt string.
-// Uses the Felix-LM fine-tuning format: <|system|>\n...\n<|user|>\n...\n<|assistant|>\n
+// formatPrompt converts a slice of Messages into a prompt string.
+// Uses ChatML format (Qwen 3.5, Gemma-it, etc.):
+//
+//	<|im_start|>system\n...<|im_end|>\n<|im_start|>user\n...<|im_end|>\n<|im_start|>assistant\n
+//
+// Appends /no_think to the system message to disable Qwen's thinking mode,
+// which interferes with GBNF grammar-constrained generation.
 func formatPrompt(messages []Message) string {
 	var b strings.Builder
 	for _, msg := range messages {
-		b.WriteString("<|")
+		b.WriteString("<|im_start|>")
 		b.WriteString(msg.Role)
-		b.WriteString("|>\n")
-		b.WriteString(msg.Content)
 		b.WriteByte('\n')
+		b.WriteString(msg.Content)
+		if msg.Role == "system" {
+			b.WriteString(" /no_think")
+		}
+		b.WriteString("<|im_end|>\n")
 	}
-	b.WriteString("<|assistant|>\n")
+	b.WriteString("<|im_start|>assistant\n")
 	return b.String()
 }
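For concreteness, this is the prompt formatPrompt builds for a two-message exchange; a runnable sketch with invented message text:

package main

import "fmt"

func main() {
	// Mirrors formatPrompt's output for:
	//   system: "You are a memory daemon." / user: "Summarize."
	// The system turn gets " /no_think" appended, and the trailing
	// "<|im_start|>assistant\n" cues the model to respond.
	prompt := "<|im_start|>system\n" +
		"You are a memory daemon. /no_think<|im_end|>\n" +
		"<|im_start|>user\n" +
		"Summarize.<|im_end|>\n" +
		"<|im_start|>assistant\n"
	fmt.Print(prompt)
}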
@@ -233,12 +265,25 @@ func (p *EmbeddedProvider) Complete(ctx context.Context, req CompletionRequest)
 			"prompt_chars", len(prompt))
 	}
 
+	// Ensure <|im_end|> is a stop sequence so the model stops at turn boundary.
+	stop := req.Stop
+	hasIMEnd := false
+	for _, s := range stop {
+		if s == "<|im_end|>" {
+			hasIMEnd = true
+			break
+		}
+	}
+	if !hasIMEnd {
+		stop = append(stop, "<|im_end|>")
+	}
+
 	backendReq := BackendCompletionRequest{
 		Prompt:      prompt,
 		MaxTokens:   maxTokens,
 		Temperature: temp,
 		TopP:        req.TopP,
-		Stop:        req.Stop,
+		Stop:        stop,
 		Grammar:     grammar,
 	}
 
@@ -247,8 +292,11 @@ func (p *EmbeddedProvider) Complete(ctx context.Context, req CompletionRequest)
 		return CompletionResponse{}, fmt.Errorf("embedded completion: %w", err)
 	}
 
+	// Strip Qwen-style <think>...</think> wrapper if present.
+	content := stripThinkingTokens(backendResp.Text)
+
 	return CompletionResponse{
-		Content:      backendResp.Text,
+		Content:      content,
 		StopReason:   "stop",
 		TokensUsed:   backendResp.PromptTokens + backendResp.CompletionTokens,
 		PromptTokens: backendResp.PromptTokens,
@@ -334,6 +382,211 @@ func (p *EmbeddedProvider) ModelInfo(ctx context.Context) (ModelMetadata, error)
 	}, nil
 }
 
+// ListAvailableModels returns models registered in models.json.
+// Only curated, production-ready models appear — not every GGUF file on disk.
+func (p *EmbeddedProvider) ListAvailableModels() ([]AvailableModel, error) {
+	p.mu.RLock()
+	dir := p.modelsDir
+	p.mu.RUnlock()
+
+	manifest, err := LoadManifest(dir)
+	if err != nil {
+		return nil, fmt.Errorf("loading model manifest: %w", err)
+	}
+
+	var models []AvailableModel
+	for _, entry := range manifest.Models {
+		models = append(models, AvailableModel{
+			Filename: entry.Filename,
+			Path:     filepath.Join(dir, entry.Filename),
+			SizeMB:   entry.SizeBytes / (1024 * 1024),
+			Role:     entry.Role,
+			Version:  entry.Version,
+			Quantize: entry.Quantize,
+		})
+	}
+	return models, nil
+}
+
+// ActiveModel returns the currently loaded model status.
+func (p *EmbeddedProvider) ActiveModel() ModelStatus {
+	p.mu.RLock()
+	defer p.mu.RUnlock()
+	return ModelStatus{
+		ChatModel:  p.chatModelFile,
+		EmbedModel: p.embedModelFile,
+		Loaded:     p.chatBackend != nil,
+		ModelsDir:  p.modelsDir,
+	}
+}
+
+// SwapChatModel hot-swaps the chat model to a different GGUF file.
+// The old backend is closed after the new one is loaded successfully.
+func (p *EmbeddedProvider) SwapChatModel(filename string) error {
+	p.mu.RLock()
+	factory := p.backendFactory
+	dir := p.modelsDir
+	opts := p.opts
+	p.mu.RUnlock()
+
+	if factory == nil {
+		return fmt.Errorf("no backend factory configured — cannot swap models")
+	}
+
+	modelPath := filepath.Join(dir, filename)
+	if _, err := os.Stat(modelPath); err != nil {
+		return fmt.Errorf("model not found at %s: %w", modelPath, err)
+	}
+
+	// Load new model before acquiring write lock
+	newBackend := factory()
+	if err := newBackend.LoadModel(modelPath, opts); err != nil {
+		return fmt.Errorf("loading new chat model %s: %w", filename, err)
+	}
+	slog.Info("loaded new chat model for swap", "path", modelPath)
+
+	// Swap under write lock
+	p.mu.Lock()
+	oldBackend := p.chatBackend
+	p.chatBackend = newBackend
+	p.chatModelFile = filename
+	p.mu.Unlock()
+
+	// Close old backend outside the lock
+	if oldBackend != nil {
+		if err := oldBackend.Close(); err != nil {
+			slog.Warn("error closing old chat backend during swap", "error", err)
+		}
+	}
+
+	slog.Info("chat model swapped", "model", filename)
+	return nil
+}
+
+// SwapEmbedModel hot-swaps the embedding model to a different GGUF file.
+// Pass empty string to clear the dedicated embedding model (falls back to chat backend).
+func (p *EmbeddedProvider) SwapEmbedModel(filename string) error {
+	if filename == "" {
+		p.mu.Lock()
+		oldBackend := p.embedBackend
+		p.embedBackend = nil
+		p.embedModelFile = ""
+		p.mu.Unlock()
+		if oldBackend != nil {
+			if err := oldBackend.Close(); err != nil {
+				slog.Warn("error closing old embed backend during swap", "error", err)
+			}
+		}
+		slog.Info("embed model cleared, using chat backend for embeddings")
+		return nil
+	}
+
+	p.mu.RLock()
+	factory := p.backendFactory
+	dir := p.modelsDir
+	opts := p.opts
+	p.mu.RUnlock()
+
+	if factory == nil {
+		return fmt.Errorf("no backend factory configured — cannot swap models")
+	}
+
+	modelPath := filepath.Join(dir, filename)
+	if _, err := os.Stat(modelPath); err != nil {
+		return fmt.Errorf("embed model not found at %s: %w", modelPath, err)
+	}
+
+	newBackend := factory()
+	if err := newBackend.LoadModel(modelPath, opts); err != nil {
+		return fmt.Errorf("loading new embed model %s: %w", filename, err)
+	}
+
+	p.mu.Lock()
+	oldBackend := p.embedBackend
+	p.embedBackend = newBackend
+	p.embedModelFile = filename
+	p.mu.Unlock()
+
+	if oldBackend != nil {
+		if err := oldBackend.Close(); err != nil {
+			slog.Warn("error closing old embed backend during swap", "error", err)
+		}
+	}
+
+	slog.Info("embed model swapped", "model", filename)
+	return nil
+}
+
+// Unload releases all backend resources without destroying the provider config.
+// The provider can be reloaded later with Reload().
+func (p *EmbeddedProvider) Unload() {
+	p.mu.Lock()
+	defer p.mu.Unlock()
+
+	if p.chatBackend != nil {
+		if err := p.chatBackend.Close(); err != nil {
+			slog.Warn("error closing chat backend during unload", "error", err)
+		}
+		p.chatBackend = nil
+		slog.Info("unloaded chat model", "model", p.chatModelFile)
+	}
+	if p.embedBackend != nil {
+		if err := p.embedBackend.Close(); err != nil {
+			slog.Warn("error closing embed backend during unload", "error", err)
+		}
+		p.embedBackend = nil
+		slog.Info("unloaded embed model", "model", p.embedModelFile)
+	}
+}
+// Reload reloads models using the stored backend factory.
+// Called after Unload() to restore embedded inference.
+func (p *EmbeddedProvider) Reload() error {
+	p.mu.RLock()
+	factory := p.backendFactory
+	p.mu.RUnlock()
+
+	if factory == nil {
+		return fmt.Errorf("no backend factory configured — cannot reload")
+	}
+	return p.LoadModels(factory)
+}
+
+// SetProviderMode on a bare EmbeddedProvider only supports "embedded".
+func (p *EmbeddedProvider) SetProviderMode(mode string) error {
+	if mode == "embedded" {
+		return nil
+	}
+	return fmt.Errorf("API provider not configured — only embedded mode available")
+}
+
+// ProviderMode always returns "embedded" for a bare EmbeddedProvider.
+func (p *EmbeddedProvider) ProviderMode() string {
+	return "embedded"
+}
+
+// stripThinkingTokens removes <think>...</think> blocks from model output.
+// Qwen 3.5 and similar models prepend reasoning tokens before the actual response.
+func stripThinkingTokens(text string) string {
+	const openTag = "<think>"
+	const closeTag = "</think>"
+
+	for {
+		start := strings.Index(text, openTag)
+		if start == -1 {
+			break
+		}
+		end := strings.Index(text[start:], closeTag)
+		if end == -1 {
+			// Unclosed think tag — strip from start to end of text
+			text = strings.TrimSpace(text[:start])
+			break
+		}
+		text = text[:start] + text[start+end+len(closeTag):]
+	}
+	return strings.TrimSpace(text)
+}
+
 // Close releases all backend resources.
 func (p *EmbeddedProvider) Close() error {
 	p.mu.Lock()
diff --git a/internal/llm/provider.go b/internal/llm/provider.go
index 90da6a0e..df49f7b8 100644
--- a/internal/llm/provider.go
+++ b/internal/llm/provider.go
@@ -111,6 +111,28 @@ type Provider interface {
 	ModelInfo(ctx context.Context) (ModelMetadata, error)
 }
 
+// ModelManager is the interface for runtime model management.
+// Implemented by SwitchableProvider (embedded + API) or EmbeddedProvider alone.
+type ModelManager interface {
+	// ListAvailableModels returns models from the manifest.
+	ListAvailableModels() ([]AvailableModel, error)
+
+	// ActiveModel returns the currently loaded model status.
+	ActiveModel() ModelStatus
+
+	// SwapChatModel hot-swaps the chat model to a different GGUF file.
+	SwapChatModel(filename string) error
+
+	// SwapEmbedModel hot-swaps the embedding model. Empty string clears it.
+	SwapEmbedModel(filename string) error
+
+	// SetProviderMode switches between "embedded" and "api" at runtime.
+	SetProviderMode(mode string) error
+
+	// ProviderMode returns the current mode ("embedded" or "api").
+	ProviderMode() string
+}
+
 // ErrProviderUnavailable is returned when the LLM backend is not reachable.
 type ErrProviderUnavailable struct {
 	Endpoint string
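A sketch of how the interface is meant to be consumed, written as if inside this repo; the compile-time assertions and the printModels helper are illustrative, not part of the diff:

package main

import (
	"fmt"

	"github.com/appsprout-dev/mnemonic/internal/llm"
)

// Both implementations satisfy ModelManager; these assertions fail to
// compile if a method goes missing or a signature drifts.
var (
	_ llm.ModelManager = (*llm.EmbeddedProvider)(nil)
	_ llm.ModelManager = (*llm.SwitchableProvider)(nil)
)

// printModels works against any ModelManager, which is why the HTTP
// handlers above take the interface rather than a concrete provider.
func printModels(mgr llm.ModelManager) error {
	models, err := mgr.ListAvailableModels()
	if err != nil {
		return err
	}
	for _, m := range models {
		fmt.Printf("%-40s %-10s %d MB\n", m.Filename, m.Role, m.SizeMB)
	}
	fmt.Println("mode:", mgr.ProviderMode())
	return nil
}

func main() {}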
diff --git a/internal/llm/switchable.go b/internal/llm/switchable.go
new file mode 100644
index 00000000..4ce76456
--- /dev/null
+++ b/internal/llm/switchable.go
@@ -0,0 +1,150 @@
+package llm
+
+import (
+	"context"
+	"fmt"
+	"sync"
+)
+
+// SwitchableProvider wraps an embedded provider and an API provider,
+// allowing runtime switching between local inference and cloud API.
+// All agents hold references to this provider, so switching takes effect
+// immediately across the entire daemon.
+type SwitchableProvider struct {
+	mu       sync.RWMutex
+	embedded *EmbeddedProvider
+	api      Provider
+	useAPI   bool
+	apiModel string // model name for display (e.g. "gemini-3-flash-preview")
+}
+
+// NewSwitchableProvider creates a provider that can toggle between embedded and API.
+// Starts in embedded mode.
+func NewSwitchableProvider(embedded *EmbeddedProvider, api Provider, apiModel string) *SwitchableProvider {
+	return &SwitchableProvider{
+		embedded: embedded,
+		api:      api,
+		apiModel: apiModel,
+	}
+}
+
+func (s *SwitchableProvider) active() Provider {
+	if s.useAPI {
+		return s.api
+	}
+	return s.embedded
+}
+
+// Complete delegates to the active provider.
+func (s *SwitchableProvider) Complete(ctx context.Context, req CompletionRequest) (CompletionResponse, error) {
+	s.mu.RLock()
+	p := s.active()
+	s.mu.RUnlock()
+	return p.Complete(ctx, req)
+}
+
+// Embed delegates to the active provider.
+func (s *SwitchableProvider) Embed(ctx context.Context, text string) ([]float32, error) {
+	s.mu.RLock()
+	p := s.active()
+	s.mu.RUnlock()
+	return p.Embed(ctx, text)
+}
+
+// BatchEmbed delegates to the active provider.
+func (s *SwitchableProvider) BatchEmbed(ctx context.Context, texts []string) ([][]float32, error) {
+	s.mu.RLock()
+	p := s.active()
+	s.mu.RUnlock()
+	return p.BatchEmbed(ctx, texts)
+}
+
+// Health delegates to the active provider.
+func (s *SwitchableProvider) Health(ctx context.Context) error {
+	s.mu.RLock()
+	p := s.active()
+	s.mu.RUnlock()
+	return p.Health(ctx)
+}
+
+// ModelInfo delegates to the active provider.
+func (s *SwitchableProvider) ModelInfo(ctx context.Context) (ModelMetadata, error) {
+	s.mu.RLock()
+	p := s.active()
+	s.mu.RUnlock()
+	return p.ModelInfo(ctx)
+}
+
+// --- ModelManager implementation ---
+
+// ListAvailableModels delegates to the embedded provider.
+func (s *SwitchableProvider) ListAvailableModels() ([]AvailableModel, error) {
+	return s.embedded.ListAvailableModels()
+}
+
+// ActiveModel returns the current model status including provider mode.
+func (s *SwitchableProvider) ActiveModel() ModelStatus {
+	s.mu.RLock()
+	defer s.mu.RUnlock()
+	status := s.embedded.ActiveModel()
+	if s.useAPI {
+		status.Mode = "api"
+		status.APIModel = s.apiModel
+	} else {
+		status.Mode = "embedded"
+	}
+	return status
+}
+
+// SwapChatModel delegates to the embedded provider.
+func (s *SwitchableProvider) SwapChatModel(filename string) error {
+	return s.embedded.SwapChatModel(filename)
+}
+
+// SwapEmbedModel delegates to the embedded provider.
+func (s *SwitchableProvider) SwapEmbedModel(filename string) error {
+	return s.embedded.SwapEmbedModel(filename)
+}
+
+// SetProviderMode switches between "embedded" and "api" at runtime.
+// Switching to API unloads embedded models to free VRAM.
+// Switching back to embedded reloads them.
+func (s *SwitchableProvider) SetProviderMode(mode string) error {
+	s.mu.Lock()
+	defer s.mu.Unlock()
+
+	switch mode {
+	case "embedded":
+		if !s.useAPI {
+			return nil // already in embedded mode
+		}
+		// Reload models
+		if err := s.embedded.Reload(); err != nil {
+			return fmt.Errorf("reloading embedded models: %w", err)
+		}
+		s.useAPI = false
+	case "api":
+		if s.api == nil {
+			return fmt.Errorf("API provider not configured")
+		}
+		if s.useAPI {
+			return nil // already in API mode
+		}
+		s.useAPI = true
+		// Unload models to free VRAM
+		s.embedded.Unload()
+	default:
+		return fmt.Errorf("unknown provider mode: %q (use \"embedded\" or \"api\")", mode)
+	}
+	return nil
+}
+
+// ProviderMode returns the current mode ("embedded" or "api").
+func (s *SwitchableProvider) ProviderMode() string {
+	s.mu.RLock()
+	defer s.mu.RUnlock()
+	if s.useAPI {
+		return "api"
+	}
+	return "embedded"
+}
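A usage sketch of the runtime toggle, assuming a SwitchableProvider wired as in runtime.go above; demoModeSwitch is illustrative:

package main

import (
	"fmt"

	"github.com/appsprout-dev/mnemonic/internal/llm"
)

// demoModeSwitch exercises the toggle. Switching to "api" closes the
// llama.cpp backends (freeing VRAM); switching back reloads the GGUF
// files via the factory retained by LoadModels.
func demoModeSwitch(sp *llm.SwitchableProvider) error {
	if err := sp.SetProviderMode("api"); err != nil {
		return err // e.g. no API provider was configured at startup
	}
	fmt.Println("now:", sp.ProviderMode()) // "api"

	if err := sp.SetProviderMode("embedded"); err != nil {
		return err // Reload can fail if a model file went missing
	}
	fmt.Println("now:", sp.ProviderMode()) // "embedded"
	return nil
}

func main() {}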
diff --git a/internal/web/static/index.html b/internal/web/static/index.html
index 53310092..b5c84823 100644
--- a/internal/web/static/index.html
+++ b/internal/web/static/index.html
@@ -1109,6 +1109,7 @@
+        <button class="nav-tab" onclick="switchView('models')">Models</button>
@@ ... @@
+    <!-- Models view (Model Control Center) -->
+    <div id="view-models" class="view">
+      <div class="view-header">
+        <div>
+          <div class="view-title">Model Control Center</div>
+          <div class="view-subtitle">Embedded GGUF model management</div>
+        </div>
+        <div>
+          <span id="modelsUpdated" class="dim"></span>
+          <button class="btn" onclick="loadModels()">Refresh</button>
+        </div>
+      </div>
+      <div class="stat-grid">
+        <div class="stat-card">
+          <div class="stat-label">Chat Model</div>
+          <div class="stat-value" id="modelChatName">-</div>
+        </div>
+        <div class="stat-card">
+          <div class="stat-label">Embed Model</div>
+          <div class="stat-value" id="modelEmbedName">-</div>
+        </div>
+        <div class="stat-card">
+          <div class="stat-label">Status</div>
+          <div class="stat-value" id="modelStatus">-</div>
+        </div>
+        <div class="stat-card">
+          <div class="stat-label">Models Dir</div>
+          <div class="stat-value" id="modelDir">-</div>
+        </div>
+        <div class="stat-card">
+          <div class="stat-label">Provider</div>
+          <div id="modelModeToggle"></div>
+        </div>
+      </div>
+      <div class="panel">
+        <div class="panel-title">Available Models</div>
+        <table class="data-table">
+          <thead>
+            <tr><th>Model</th><th>Role</th><th>Size</th><th>Status</th><th>Action</th></tr>
+          </thead>
+          <tbody id="modelsTableBody">
+            <tr><td colspan="5">Loading...</td></tr>
+          </tbody>
+        </table>
+      </div>
+      <div id="modelSwapStatus" class="panel" style="display:none">
+        <pre id="modelSwapLog"></pre>
+      </div>
+    </div>
diff --git a/internal/web/static/js/app.js b/internal/web/static/js/app.js
index 5bf84ca7..a5fbd909 100644
--- a/internal/web/static/js/app.js
+++ b/internal/web/static/js/app.js
@@ -28,6 +28,7 @@ import { loadAgentData, refreshAgentData, renderAgentDashboard,
     sendChatMessage, toggleChatHistory, startNewConversation, onModelChange,
     checkForUpdate, triggerUpdate, renderMarkdown } from './agent.js';
 import { formatBytes } from './llm.js';
+import { loadModels, swapChatModel, swapEmbedModel, switchProviderMode } from './models.js';
 
 // ── Wire to window for HTML onclick handlers ──
 Object.assign(window, {
@@ -58,6 +59,8 @@ Object.assign(window, {
     loadAgentData, refreshAgentData, renderAgentDashboard, sendChatMessage,
     toggleChatHistory, startNewConversation, onModelChange, checkForUpdate, triggerUpdate,
+    // Models
+    loadModels, swapChatModel, swapEmbedModel, switchProviderMode,
     // Utils (used by some inline HTML)
     escapeHtml, showToast, relativeTime, simpleMarkdown, toggleToolDetail,
     agentProfile, memoryType, safeSalience, renderMarkdown, formatBytes,
diff --git a/internal/web/static/js/models.js b/internal/web/static/js/models.js
new file mode 100644
index 00000000..3d96f455
--- /dev/null
+++ b/internal/web/static/js/models.js
@@ -0,0 +1,161 @@
+import { state } from './state.js';
+import { fetchJSON, escapeHtml } from './utils.js';
+
+var _swapLog = [];
+
+function appendSwapLog(msg) {
+  _swapLog.push('[' + new Date().toLocaleTimeString() + '] ' + msg);
+  var el = document.getElementById('modelSwapLog');
+  var panel = document.getElementById('modelSwapStatus');
+  if (el && panel) {
+    panel.style.display = '';
+    el.textContent = _swapLog.join('\n');
+    el.scrollTop = el.scrollHeight;
+  }
+}
+
+export async function loadModels() {
+  try {
+    var data = await fetchJSON('/models');
+    state.modelsLoaded = true;
+
+    if (!data.enabled) {
+      document.getElementById('modelChatName').textContent = 'N/A';
+      document.getElementById('modelEmbedName').textContent = 'N/A';
+      document.getElementById('modelStatus').textContent = 'Not available';
+      document.getElementById('modelStatus').style.color = 'var(--text-dim)';
+      document.getElementById('modelDir').textContent = '-';
+      document.getElementById('modelModeToggle').innerHTML = '';
+      document.getElementById('modelsTableBody').innerHTML = '<tr><td colspan="5">Embedded provider not active. Set llm.provider: "embedded" in config.yaml and rebuild with make build-embedded</td></tr>';
+      document.getElementById('modelsUpdated').textContent = 'Updated ' + new Date().toLocaleTimeString();
+      return;
+    }
+
+    var active = data.active || {};
+    var mode = data.mode || active.mode || 'embedded';
+    var isAPI = mode === 'api';
+
+    // Mode toggle button
+    var toggleEl = document.getElementById('modelModeToggle');
+    if (isAPI) {
+      document.getElementById('modelChatName').textContent = active.api_model || 'Gemini';
+      document.getElementById('modelEmbedName').textContent = 'API';
+      document.getElementById('modelStatus').textContent = 'Cloud API';
+      document.getElementById('modelStatus').style.color = 'var(--accent-cyan)';
+      toggleEl.innerHTML = '<button class="btn" onclick="switchProviderMode(\'embedded\')">Switch to Embedded</button>';
+    } else {
+      document.getElementById('modelChatName').textContent = active.chat_model || 'none';
+      document.getElementById('modelEmbedName').textContent = active.embed_model || '(using chat model)';
+      if (active.loaded) {
+        document.getElementById('modelStatus').textContent = 'Loaded';
+        document.getElementById('modelStatus').style.color = 'var(--accent-green)';
+      } else {
+        document.getElementById('modelStatus').textContent = 'Not loaded';
+        document.getElementById('modelStatus').style.color = 'var(--accent-red)';
+      }
+      toggleEl.innerHTML = '<button class="btn" onclick="switchProviderMode(\'api\')">Switch to API</button>';
+    }
+    document.getElementById('modelDir').textContent = active.models_dir || '-';
+
+    var models = data.models || [];
+    var tbody = document.getElementById('modelsTableBody');
+
+    if (models.length === 0) {
+      tbody.innerHTML = '<tr><td colspan="5">No models in models.json.</td></tr>';
+    } else {
+      tbody.innerHTML = models.map(function(m) {
+        var isChatActive = !isAPI && m.filename === active.chat_model;
+        var isEmbedActive = !isAPI && m.filename === active.embed_model;
+        var status = '';
+        if (isChatActive) status = 'active (chat)';
+        else if (isEmbedActive) status = 'active (embed)';
+        else if (isAPI) status = 'standby';
+
+        var roleLabel = m.role || '-';
+        var detail = [m.quantize, m.version].filter(Boolean).join(' / ');
+
+        var actions = '';
+        if (!isAPI && m.role === 'chat' && !isChatActive) {
+          actions += '<button class="btn" onclick="swapChatModel(\'' + m.filename + '\')">Load</button>';
+        }
+        if (!isAPI && m.role === 'embedding' && !isEmbedActive) {
+          actions += '<button class="btn" onclick="swapEmbedModel(\'' + m.filename + '\')">Load</button>';
+        }
+        if (isChatActive || isEmbedActive) actions = '-';
+
+        return '<tr>' +
+          '<td>' + escapeHtml(m.filename) + (detail ? ' <span class="dim">' + escapeHtml(detail) + '</span>' : '') + '</td>' +
+          '<td>' + roleLabel + '</td>' +
+          '<td>' + m.size_mb + ' MB</td>' +
+          '<td>' + status + '</td>' +
+          '<td>' + actions + '</td>' +
+          '</tr>';
+      }).join('');
+    }
+
+    document.getElementById('modelsUpdated').textContent = 'Updated ' + new Date().toLocaleTimeString();
+  } catch (e) {
+    document.getElementById('modelsTableBody').innerHTML = '<tr><td colspan="5">Error: ' + escapeHtml(e.message) + '</td></tr>';
+  }
+}
+
+export async function switchProviderMode(mode) {
+  var label = mode === 'api' ? 'Gemini (API)' : 'Embedded (local)';
+  appendSwapLog('Switching to ' + label + '...');
+  try {
+    var resp = await fetch('/api/v1/models/active', {
+      method: 'POST',
+      headers: { 'Content-Type': 'application/json' },
+      body: JSON.stringify({ mode: mode })
+    });
+    var data = await resp.json();
+    if (!resp.ok) {
+      appendSwapLog('ERROR: ' + (data.error || 'unknown error'));
+      return;
+    }
+    appendSwapLog('Switched to ' + label);
+    loadModels();
+  } catch (e) {
+    appendSwapLog('ERROR: ' + e.message);
+  }
+}
+
+export async function swapChatModel(filename) {
+  appendSwapLog('Swapping chat model to ' + filename + '...');
+  try {
+    var resp = await fetch('/api/v1/models/active', {
+      method: 'POST',
+      headers: { 'Content-Type': 'application/json' },
+      body: JSON.stringify({ chat_model: filename })
+    });
+    var data = await resp.json();
+    if (!resp.ok) {
+      appendSwapLog('ERROR: ' + (data.error || 'unknown error'));
+      return;
+    }
+    appendSwapLog('Chat model swapped to ' + filename);
+    loadModels();
+  } catch (e) {
+    appendSwapLog('ERROR: ' + e.message);
+  }
+}
+
+export async function swapEmbedModel(filename) {
+  appendSwapLog('Swapping embed model to ' + filename + '...');
+  try {
+    var resp = await fetch('/api/v1/models/active', {
+      method: 'POST',
+      headers: { 'Content-Type': 'application/json' },
+      body: JSON.stringify({ embed_model: filename })
+    });
+    var data = await resp.json();
+    if (!resp.ok) {
+      appendSwapLog('ERROR: ' + (data.error || 'unknown error'));
+      return;
+    }
+    appendSwapLog('Embed model swapped to ' + filename);
+    loadModels();
+  } catch (e) {
+    appendSwapLog('ERROR: ' + e.message);
+  }
+}
diff --git a/internal/web/static/js/nav.js b/internal/web/static/js/nav.js
index 5eb5c143..3000674f 100644
--- a/internal/web/static/js/nav.js
+++ b/internal/web/static/js/nav.js
@@ -18,7 +18,7 @@ export function switchView(name) {
   if (tab) tab.classList.add('active');
   window.location.hash = name;
   // Update breadcrumbs
-  var crumbMap = { recall: 'Search', explore: 'Forum', timeline: 'Timeline', agent: 'SDK', llm: 'LLM Usage', tools: 'Tools' };
+  var crumbMap = { recall: 'Search', explore: 'Forum', timeline: 'Timeline', agent: 'SDK', llm: 'LLM Usage', tools: 'Tools', models: 'Models' };
   var bc = document.getElementById('breadcrumbs');
   if (bc && crumbMap[name]) bc.innerHTML = 'mnemonic' + crumbMap[name];
   if (name === 'explore') {
@@ -30,6 +30,7 @@ export function switchView(name) {
   if (name === 'agent' && !state.agentLoaded) window.loadAgentData();
   if (name === 'llm' && !state.llmLoaded) window.loadLLMUsage();
   if (name === 'tools' && !state.toolsLoaded) window.loadToolUsage();
+  if (name === 'models' && !state.modelsLoaded) window.loadModels();
 }
 
 export function switchExploreTab(tab) {
@@ -67,7 +68,7 @@ export function handleHash() {
     if (catId) window.loadForumCategory(catId, catId);
     return;
   }
-  if (['recall', 'explore', 'timeline', 'agent', 'llm', 'tools'].includes(hash)) switchView(hash);
+  if (['recall', 'explore', 'timeline', 'agent', 'llm', 'tools', 'models'].includes(hash)) switchView(hash);
 }
 
 window.addEventListener('hashchange', handleHash);
@@ -84,6 +85,7 @@ document.addEventListener('keydown', function(e) {
     case '4': switchView('agent'); break;
     case '5': switchView('llm'); break;
     case '6': switchView('tools'); break;
+    case '7': switchView('models'); break;
   }
 });
diff --git a/training/scripts/export_qwen35_spokes.py b/training/scripts/export_qwen35_spokes.py
index 646cfe6b..1e46b015 100644
--- a/training/scripts/export_qwen35_spokes.py
+++ b/training/scripts/export_qwen35_spokes.py
@@ -154,6 +154,37 @@ def main():
     for layer_idx in norm_layers:
         spoke_tensors[f"blk.{layer_idx}.spoke.norm.weight"] = torch.ones(d_model, dtype=torch.float32)
 
+    # Build fused spoke matrices for fewer GPU kernel launches
+    # w_down_fused = cat([w_down[0], ..., w_down[n-1]], dim=0) -> (rank*n_spokes, d_model)
+    # w_up_fused   = cat([w_up[0], ..., w_up[n-1]], dim=1)     -> (d_model, rank*n_spokes)
+    n_spokes = spoke_config.num_spokes
+    fused_count = 0
+    for layer_idx in sorted(norm_layers):
+        w_downs = []
+        w_ups = []
+        for s in range(n_spokes):
+            down_key = f"blk.{layer_idx}.spoke.w_down.{s}.weight"
+            up_key = f"blk.{layer_idx}.spoke.w_up.{s}.weight"
+            if down_key in spoke_tensors and up_key in spoke_tensors:
+                w_downs.append(spoke_tensors[down_key])
+                w_ups.append(spoke_tensors[up_key])
+
+        if len(w_downs) == n_spokes:
+            # w_down[s] shape: (rank, d_model) -> concat along dim 0 -> (rank*n_spokes, d_model)
+            w_down_fused = torch.cat(w_downs, dim=0)
+            # w_up[s] shape: (d_model, rank) -> concat along dim 1 -> (d_model, rank*n_spokes)
+            w_up_fused = torch.cat(w_ups, dim=1)
+            spoke_tensors[f"blk.{layer_idx}.spoke.w_down_fused.weight"] = w_down_fused
+            spoke_tensors[f"blk.{layer_idx}.spoke.w_up_fused.weight"] = w_up_fused
+            # Remove individual spoke tensors (fused replaces them)
+            for s in range(n_spokes):
+                spoke_tensors.pop(f"blk.{layer_idx}.spoke.w_down.{s}.weight", None)
+                spoke_tensors.pop(f"blk.{layer_idx}.spoke.w_up.{s}.weight", None)
+            fused_count += 1
+
+    if fused_count > 0:
+        print(f"  Created {fused_count} fused spoke matrix pairs (2 matmuls/layer instead of {2 * n_spokes})")
+
     print(f"  Prepared {len(spoke_tensors)} spoke tensors ({len(norm_layers)} layers)")
 
     # --- Phase 3: Copy base GGUF and patch with spokes ---
diff --git a/training/scripts/quantize_rq4.py b/training/scripts/quantize_rq4.py
index c9c6ad30..18dbac8f 100644
--- a/training/scripts/quantize_rq4.py
+++ b/training/scripts/quantize_rq4.py
@@ -171,9 +171,13 @@ def main():
     # Skip only norms, biases, embeddings, and individual (non-fused) spoke matrices.
     is_individual_spoke = ("spoke" in t.name and "fused" not in t.name
                            and ("w_down" in t.name or "w_up" in t.name))
+    # Inner dim (ne[0] in GGUF) must be >= QK_RQ4 to form valid blocks.
+    # Qwen 3.5 hybrid has ssm_conv1d with ne[0]=4 which can't be quantized.
+    inner_dim_ok = int(t.shape[0]) >= QK_RQ4
     should_quantize = (
         len(t.shape) == 2
         and t.n_elements >= args.min_elements
+        and inner_dim_ok
         and not any(p in t.name for p in skip_patterns)
         and not is_individual_spoke
     )