diff --git a/pkg/llmproxy/benchmarks/client.go b/pkg/llmproxy/benchmarks/client.go
new file mode 100644
index 0000000000..381190ae2f
--- /dev/null
+++ b/pkg/llmproxy/benchmarks/client.go
@@ -0,0 +1,173 @@
+// Package benchmarks provides integration with tokenledger for dynamic benchmark data.
+//
+// This enables cliproxy++ to use real-time benchmark data from:
+// - Artificial Analysis API (intelligence, speed, latency)
+// - OpenRouter API (pricing, context)
+// - CLIProxyAPI metrics (runtime performance)
+package benchmarks
+
+import (
+	"encoding/json"
+	"fmt"
+	"sync"
+	"time"
+)
+
+// BenchmarkData represents benchmark data for a model
+type BenchmarkData struct {
+	ModelID           string   `json:"model_id"`
+	Provider          string   `json:"provider,omitempty"`
+	IntelligenceIndex *float64 `json:"intelligence_index,omitempty"`
+	CodingIndex       *float64 `json:"coding_index,omitempty"`
+	SpeedTPS          *float64 `json:"speed_tps,omitempty"`
+	LatencyTTFTMs     *float64 `json:"latency_ttft_ms,omitempty"`
+	InputPricePer1M   *float64 `json:"price_input_per_1m,omitempty"`
+	OutputPricePer1M  *float64 `json:"price_output_per_1m,omitempty"`
+	ContextWindow     *int64   `json:"context_window_tokens,omitempty"`
+	Confidence        float64  `json:"confidence"`
+	Source            string   `json:"source"`
+}
+
+// Client for fetching benchmarks from tokenledger
+type Client struct {
+	tokenledgerPath string
+	cache           map[string]*cacheEntry
+	cacheMu         sync.RWMutex
+	cacheTTL        time.Duration
+}
+
+type cacheEntry struct {
+	data    BenchmarkData
+	expires time.Time
+}
+
+// NewClient creates a new benchmark client
+func NewClient(tokenledgerPath string) *Client {
+	return &Client{
+		tokenledgerPath: tokenledgerPath,
+		cache:           make(map[string]*cacheEntry),
+		cacheTTL:        time.Hour,
+	}
+}
+
+// GetBenchmark returns benchmark data for a model, with caching
+func (c *Client) GetBenchmark(modelID string) (*BenchmarkData, error) {
+	// Check cache first
+	c.cacheMu.RLock()
+	if entry, ok := c.cache[modelID]; ok && time.Now().Before(entry.expires) {
+		c.cacheMu.RUnlock()
+		return &entry.data, nil
+	}
+	c.cacheMu.RUnlock()
+
+	// Fetch fresh data
+	data, err := c.fetchFromTokenledger(modelID)
+	if err != nil {
+		// Return cached expired data if fetch fails
+		c.cacheMu.RLock()
+		if entry, ok := c.cache[modelID]; ok {
+			c.cacheMu.RUnlock()
+			return &entry.data, nil
+		}
+		c.cacheMu.RUnlock()
+		return nil, err
+	}
+
+	// Update cache
+	c.cacheMu.Lock()
+	c.cache[modelID] = &cacheEntry{
+		data:    *data,
+		expires: time.Now().Add(c.cacheTTL),
+	}
+	c.cacheMu.Unlock()
+
+	return data, nil
+}
+
+// fetchFromTokenledger calls the tokenledger CLI to get benchmark data
+func (c *Client) fetchFromTokenledger(modelID string) (*BenchmarkData, error) {
+	// Call tokenledger CLI (would be implemented in Rust binary)
+	// For now, return nil to use fallback hardcoded values
+	return nil, fmt.Errorf("tokenledger not configured")
+}
+
+// GetAllBenchmarks returns all available benchmark data
+func (c *Client) GetAllBenchmarks() ([]BenchmarkData, error) {
+	// This would call tokenledger to get all benchmarks
+	return nil, fmt.Errorf("tokenledger not configured")
+}
+
+// RefreshBenchmarks forces a refresh of benchmark data
+func (c *Client) RefreshBenchmarks() error {
+	// Clear cache
+	c.cacheMu.Lock()
+	c.cache = make(map[string]*cacheEntry)
+	c.cacheMu.Unlock()
+
+	// Would trigger tokenledger to fetch fresh data
+	return nil
+}
+
+// GetQualityScore returns the quality score for a model
+func (c *Client) GetQualityScore(modelID string) (float64, bool) {
+	data, err := c.GetBenchmark(modelID)
+	if err != nil || data == nil || data.IntelligenceIndex == nil {
+		return 0, false
+	}
+	return *data.IntelligenceIndex / 100.0, true // Normalize to 0-1
+}
+
+// GetCost returns the cost per 1K tokens for a model
+func (c *Client) GetCost(modelID string) (float64, bool) {
+	data, err := c.GetBenchmark(modelID)
+	if err != nil || data == nil || data.InputPricePer1M == nil {
+		return 0, false
+	}
+	return *data.InputPricePer1M / 1000.0, true // field is USD per 1M tokens; callers expect per-1K
+}
+
+// GetLatency returns the latency in ms for a model
+func (c *Client) GetLatency(modelID string) (int, bool) {
+	data, err := c.GetBenchmark(modelID)
+	if err != nil || data == nil || data.LatencyTTFTMs == nil {
+		return 0, false
+	}
+	return int(*data.LatencyTTFTMs), true
+}
+
+// BenchmarkProvider defines interface for benchmark sources
+type BenchmarkProvider interface {
+	GetBenchmark(modelID string) (*BenchmarkData, error)
+	GetAllBenchmarks() ([]BenchmarkData, error)
+	Refresh() error
+}
+
+// MockProvider provides hardcoded fallback data
+type MockProvider struct{}
+
+// NewMockProvider creates a provider with fallback data
+func NewMockProvider() *MockProvider {
+	return &MockProvider{}
+}
+
+func (p *MockProvider) GetBenchmark(modelID string) (*BenchmarkData, error) {
+	return nil, fmt.Errorf("not implemented")
+}
+
+func (p *MockProvider) GetAllBenchmarks() ([]BenchmarkData, error) {
+	return nil, fmt.Errorf("not implemented")
+}
+
+func (p *MockProvider) Refresh() error {
+	return nil
+}
+
+// JSON marshaling support
+func (b *BenchmarkData) MarshalJSON() ([]byte, error) {
+	type Alias BenchmarkData
+	return json.Marshal(&struct {
+		*Alias
+	}{
+		Alias: (*Alias)(b),
+	})
+}
diff --git a/pkg/llmproxy/benchmarks/unified.go b/pkg/llmproxy/benchmarks/unified.go
new file mode 100644
index 0000000000..0785ecf50c
--- /dev/null
+++ b/pkg/llmproxy/benchmarks/unified.go
@@ -0,0 +1,182 @@
+// Package benchmarks provides unified benchmark access with fallback to hardcoded values.
+// This integrates with tokenledger for dynamic data while maintaining backward compatibility.
+package benchmarks
+
+import (
+	"fmt"
+	"sync"
+)
+
+// UnifiedBenchmarkStore combines dynamic tokenledger data with hardcoded fallbacks
+type UnifiedBenchmarkStore struct {
+	primary   BenchmarkProvider
+	fallbacks *FallbackProvider
+	mu        sync.RWMutex
+}
+
+// FallbackProvider provides hardcoded benchmark values
+type FallbackProvider struct {
+	// qualityProxy maps known model IDs to their quality scores in [0,1]
+	QualityProxy map[string]float64
+	// CostPer1kProxy maps model IDs to estimated cost per 1k tokens (USD)
+	CostPer1kProxy map[string]float64
+	// LatencyMsProxy maps model IDs to estimated p50 latency in milliseconds
+	LatencyMsProxy map[string]int
+}
+
+// DefaultFallbackProvider returns the hardcoded maps from pareto_router.go
+func DefaultFallbackProvider() *FallbackProvider {
+	return &FallbackProvider{
+		QualityProxy: map[string]float64{
+			"claude-opus-4.6":               0.95,
+			"claude-opus-4.6-1m":            0.96,
+			"claude-sonnet-4.6":             0.88,
+			"claude-haiku-4.5":              0.75,
+			"gpt-5.3-codex-high":            0.92,
+			"gpt-5.3-codex":                 0.82,
+			"claude-4.5-opus-high-thinking": 0.94,
+			"claude-4.5-opus-high":          0.92,
+			"claude-4.5-sonnet-thinking":    0.85,
+			"claude-4-sonnet":               0.80,
+			"gpt-4o":                        0.85,
+			"gpt-5.1-codex":                 0.80,
+			"gemini-3-flash":                0.78,
+			"gemini-3.1-pro":                0.90,
+			"gemini-2.5-flash":              0.76,
+			"gemini-2.0-flash":              0.72,
+			"glm-5":                         0.78,
+			"minimax-m2.5":                  0.75,
+			"deepseek-v3.2":                 0.80,
+			"composer-1.5":                  0.82,
+			"composer-1":                    0.78,
+			"roo-default":                   0.70,
+			"kilo-default":                  0.70,
+		},
+		CostPer1kProxy: map[string]float64{
+			"claude-opus-4.6":               0.015,
+			"claude-opus-4.6-1m":            0.015,
+			"claude-sonnet-4.6":             0.003,
+			"claude-haiku-4.5":              0.00025,
+			"gpt-5.3-codex-high":            0.020,
+			"gpt-5.3-codex":                 0.010,
+			"claude-4.5-opus-high-thinking": 0.025,
+			"claude-4.5-opus-high":          0.015,
+			"claude-4.5-sonnet-thinking":    0.005,
+			"claude-4-sonnet":               0.003,
+			"gpt-4o":                        0.005,
+			"gpt-5.1-codex":                 0.008,
+			"gemini-3-flash":                0.00015,
+			"gemini-3.1-pro":                0.007,
+			"gemini-2.5-flash":              0.0001,
+			"gemini-2.0-flash":              0.0001,
+			"glm-5":                         0.001,
+			"minimax-m2.5":                  0.001,
+			"deepseek-v3.2":                 0.0005,
+			"composer-1.5":                  0.002,
+			"composer-1":                    0.001,
+			"roo-default":                   0.0,
+			"kilo-default":                  0.0,
+		},
+		LatencyMsProxy: map[string]int{
+			"claude-opus-4.6":               4000,
+			"claude-opus-4.6-1m":            5000,
+			"claude-sonnet-4.6":             2000,
+			"claude-haiku-4.5":              800,
+			"gpt-5.3-codex-high":            6000,
+			"gpt-5.3-codex":                 3000,
+			"claude-4.5-opus-high-thinking": 8000,
+			"claude-4.5-opus-high":          5000,
+			"claude-4.5-sonnet-thinking":    4000,
+			"claude-4-sonnet":               2500,
+			"gpt-4o":                        2000,
+			"gpt-5.1-codex":                 3000,
+			"gemini-3-flash":                600,
+			"gemini-3.1-pro":                3000,
+			"gemini-2.5-flash":              500,
+			"gemini-2.0-flash":              400,
+			"glm-5":                         1500,
+			"minimax-m2.5":                  1200,
+			"deepseek-v3.2":                 1000,
+			"composer-1.5":                  2000,
+			"composer-1":                    1500,
+			"roo-default":                   1000,
+			"kilo-default":                  1000,
+		},
+	}
+}
+
+// NewUnifiedStore creates a store with primary and fallback providers
+func NewUnifiedStore(primary BenchmarkProvider) *UnifiedBenchmarkStore {
+	return &UnifiedBenchmarkStore{
+		primary:   primary,
+		fallbacks: DefaultFallbackProvider(),
+	}
+}
+
+// NewFallbackOnlyStore creates a store with only hardcoded fallbacks
+func NewFallbackOnlyStore() *UnifiedBenchmarkStore {
+	return &UnifiedBenchmarkStore{
+		primary:   nil,
+		fallbacks: DefaultFallbackProvider(),
+	}
+}
+
+// GetQuality returns quality score, trying primary first then fallback
+func (s *UnifiedBenchmarkStore) GetQuality(modelID string) (float64, bool) {
+	// Try primary (tokenledger) first
+	if s.primary != nil {
+		if data, err := s.primary.GetBenchmark(modelID); err == nil && data != nil && data.IntelligenceIndex != nil {
+			return *data.IntelligenceIndex / 100.0, true
+		}
+	}
+
+	// Fallback to hardcoded
+	if q, ok := s.fallbacks.QualityProxy[modelID]; ok {
+		return q, true
+	}
+	return 0, false
+}
+
+// GetCost returns cost per 1K tokens, trying primary then fallback
+func (s *UnifiedBenchmarkStore) GetCost(modelID string) (float64, bool) {
+	if s.primary != nil {
+		if data, err := s.primary.GetBenchmark(modelID); err == nil && data != nil && data.InputPricePer1M != nil {
+			return *data.InputPricePer1M / 1000.0, true // convert USD per 1M tokens to per-1K, matching CostPer1kProxy units
+		}
+	}
+
+	if c, ok := s.fallbacks.CostPer1kProxy[modelID]; ok {
+		return c, true
+	}
+	return 0, false
+}
+
+// GetLatency returns latency in ms, trying primary then fallback
+func (s *UnifiedBenchmarkStore) GetLatency(modelID string) (int, bool) {
+	if s.primary != nil {
+		if data, err := s.primary.GetBenchmark(modelID); err == nil && data != nil && data.LatencyTTFTMs != nil {
+			return int(*data.LatencyTTFTMs), true
+		}
+	}
+
+	if l, ok := s.fallbacks.LatencyMsProxy[modelID]; ok {
+		return l, true
+	}
+	return 0, false
+}
+
+// GetAll returns all benchmark data from primary
+func (s *UnifiedBenchmarkStore) GetAll() ([]BenchmarkData, error) {
+	if s.primary == nil {
+		return nil, fmt.Errorf("no primary provider configured")
+	}
+	return s.primary.GetAllBenchmarks()
+}
+
+// Refresh triggers a refresh of benchmark data
+func (s *UnifiedBenchmarkStore) Refresh() error {
+	if s.primary != nil {
+		return s.primary.Refresh()
+	}
+	return nil
+}
diff --git a/pkg/llmproxy/registry/pareto_router.go b/pkg/llmproxy/registry/pareto_router.go
index 21620da3d3..fe0a178547 100644
--- a/pkg/llmproxy/registry/pareto_router.go
+++ b/pkg/llmproxy/registry/pareto_router.go
@@ -14,6 +14,8 @@ import (
 	"fmt"
 	"math"
 	"strings"
+
+	"github.com/router-for-me/CLIProxyAPI/v6/pkg/llmproxy/benchmarks"
 )
 
 // qualityProxy maps known model IDs to their quality scores in [0,1].
@@ -127,17 +129,36 @@ func inferProvider(modelID string) string {
 }
 
 // ParetoRouter selects the Pareto-optimal model for a given RoutingRequest.
-type ParetoRouter struct{}
+type ParetoRouter struct {
+	// benchmarkStore provides dynamic benchmark data with fallback
+	benchmarkStore *benchmarks.UnifiedBenchmarkStore
+}
 
-// NewParetoRouter returns a new ParetoRouter.
+// NewParetoRouter returns a new ParetoRouter with benchmarks integration.
func NewParetoRouter() *ParetoRouter { - return &ParetoRouter{} + return &ParetoRouter{ + benchmarkStore: benchmarks.NewFallbackOnlyStore(), + } +} + +// NewParetoRouterWithBenchmarks returns a ParetoRouter with dynamic benchmarks. +// Pass nil for primary to use fallback-only mode. +func NewParetoRouterWithBenchmarks(primary benchmarks.BenchmarkProvider) *ParetoRouter { + var store *benchmarks.UnifiedBenchmarkStore + if primary != nil { + store = benchmarks.NewUnifiedStore(primary) + } else { + store = benchmarks.NewFallbackOnlyStore() + } + return &ParetoRouter{ + benchmarkStore: store, + } } // SelectModel applies hard constraints, builds the Pareto frontier, and returns // the best candidate by quality/cost ratio. func (p *ParetoRouter) SelectModel(_ context.Context, req *RoutingRequest) (*RoutingCandidate, error) { - allCandidates := buildCandidates(req) + allCandidates := p.buildCandidates(req) feasible := filterByConstraints(allCandidates, req) if len(feasible) == 0 { @@ -149,18 +170,43 @@ func (p *ParetoRouter) SelectModel(_ context.Context, req *RoutingRequest) (*Rou return selectFromCandidates(frontier), nil } -// buildCandidates constructs RoutingCandidates from the quality/cost proxy tables. -// Estimated cost is scaled from per-1k-tokens to per-call assuming ~1000 tokens avg. -func buildCandidates(_ *RoutingRequest) []*RoutingCandidate { +// buildCandidates constructs RoutingCandidates from benchmark store. +// Falls back to hardcoded maps if benchmark store unavailable. +func (p *ParetoRouter) buildCandidates(req *RoutingRequest) []*RoutingCandidate { candidates := make([]*RoutingCandidate, 0, len(qualityProxy)) + for modelID, quality := range qualityProxy { - costPer1k := costPer1kProxy[modelID] - // Estimate per-call cost at 1000 token average. 
- estimatedCost := costPer1k * 1.0 - latencyMs, ok := latencyMsProxy[modelID] - if !ok { - latencyMs = 2000 + // Try dynamic benchmarks first, fallback to hardcoded + var costPer1k float64 + var latencyMs int + var ok bool + + if p.benchmarkStore != nil { + if c, found := p.benchmarkStore.GetCost(modelID); found { + costPer1k = c + } else { + costPer1k = costPer1kProxy[modelID] + } + if l, found := p.benchmarkStore.GetLatency(modelID); found { + latencyMs = l + } else { + latencyMs, ok = latencyMsProxy[modelID] + if !ok { + latencyMs = 2000 + } + } + } else { + // Fallback to hardcoded maps + costPer1k = costPer1kProxy[modelID] + var ok bool + latencyMs, ok = latencyMsProxy[modelID] + if !ok { + latencyMs = 2000 + } } + + estimatedCost := costPer1k * 1.0 // Scale to per-call + candidates = append(candidates, &RoutingCandidate{ ModelID: modelID, Provider: inferProvider(modelID),