Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -29,3 +29,4 @@ python_test_code/
!python_test_code/pyproject.toml
!python_test_code/uv.lock
!python_test_code/test_unified_suite.py
!python_test_code/test_metrics.py
47 changes: 47 additions & 0 deletions backend/src/api/metrics.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
import { Elysia } from "elysia";
import { generatePrometheusMetrics } from "@/services/prometheus";

/**
* Prometheus metrics endpoint
* Exposes operational metrics in Prometheus exposition format
*
* SECURITY NOTE: This endpoint is intentionally public (no authentication required).
* This is a deliberate design choice because:
*
* 1. Standard Practice: Prometheus metrics endpoints are typically unauthenticated
* to allow easy scraping by monitoring systems.
*
* 2. Operational Data Only: The metrics expose only aggregated operational data
* (request counts, latencies, token usage, error rates). No sensitive data
* like API keys, request/response content, or user data is exposed.
*
* 3. API Key Privacy: The `api_key_comment` label is used instead of the actual
* API key value, providing meaningful aggregation without exposing secrets.
*
* 4. Network Security: In production deployments, network-level security (firewall
* rules, VPC, ingress policies) should restrict access to the metrics endpoint
* to authorized monitoring systems only.
*
* If stricter security is required, consider:
* - Using network policies to restrict access to Prometheus scrapers
* - Deploying a metrics proxy with authentication
* - Adding optional bearer token authentication via environment variable
*/
export const metricsApi = new Elysia().get(
"/metrics",
async () => {
const metrics = await generatePrometheusMetrics();
return new Response(metrics, {
headers: {
"Content-Type": "text/plain; version=0.0.4; charset=utf-8",
},
});
},
Comment thread
pescn marked this conversation as resolved.
{
detail: {
description:
"Prometheus metrics endpoint. Returns operational metrics in Prometheus exposition format.",
tags: ["Metrics"],
},
},
);
178 changes: 178 additions & 0 deletions backend/src/db/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -1347,3 +1347,181 @@ export async function updateCompletion(
const [first] = r;
return first ?? null;
}

// ============================================
// Prometheus Metrics Operations
// ============================================

/**
* Get completion metrics grouped by model, status, and api_format
* Returns all-time totals for Prometheus counters
* Joins with api_keys table to get api_key_comment for meaningful aggregation
*/
/**
 * Aggregate all-time completion totals for Prometheus counters.
 *
 * Rows are grouped by model, status, api_format, and the owning API key's
 * comment (LEFT JOIN, so completions with no matching key fall back to
 * 'unknown'). Only non-deleted completions are counted; negative token
 * values are clamped to zero before summing.
 *
 * @returns raw rows; numeric aggregates arrive as strings from the driver
 */
export async function getCompletionMetricsByModelAndStatus() {
  logger.debug("getCompletionMetricsByModelAndStatus");

  // Shape of each aggregated row (counts/sums are stringified by the driver).
  type CompletionMetricsRow = {
    model: string;
    status: string;
    api_format: string | null;
    api_key_comment: string;
    count: string;
    prompt_tokens: string;
    completion_tokens: string;
  };

  const rows = await db.execute(sql`
    SELECT
      c.model,
      c.status,
      c.api_format,
      COALESCE(ak.comment, 'unknown') AS api_key_comment,
      COUNT(*) AS count,
      COALESCE(SUM(CASE WHEN c.prompt_tokens > 0 THEN c.prompt_tokens ELSE 0 END), 0) AS prompt_tokens,
      COALESCE(SUM(CASE WHEN c.completion_tokens > 0 THEN c.completion_tokens ELSE 0 END), 0) AS completion_tokens
    FROM completions c
    LEFT JOIN api_keys ak ON c.api_key_id = ak.id
    WHERE c.deleted = false
    GROUP BY c.model, c.status, c.api_format, ak.comment
  `);
  return rows as unknown as CompletionMetricsRow[];
}

/**
* Get embedding metrics grouped by model and status
* Returns all-time totals for Prometheus counters
* Joins with api_keys table to get api_key_comment for meaningful aggregation
*/
/**
 * Aggregate all-time embedding totals for Prometheus counters.
 *
 * Rows are grouped by model, status, and the owning API key's comment
 * (LEFT JOIN, 'unknown' when no key matches). Only non-deleted embeddings
 * are counted; negative token values are clamped to zero before summing.
 *
 * @returns raw rows; numeric aggregates arrive as strings from the driver
 */
export async function getEmbeddingMetricsByModelAndStatus() {
  logger.debug("getEmbeddingMetricsByModelAndStatus");

  // Shape of each aggregated row (counts/sums are stringified by the driver).
  type EmbeddingMetricsRow = {
    model: string;
    status: string;
    api_key_comment: string;
    count: string;
    input_tokens: string;
  };

  const rows = await db.execute(sql`
    SELECT
      e.model,
      e.status,
      COALESCE(ak.comment, 'unknown') AS api_key_comment,
      COUNT(*) AS count,
      COALESCE(SUM(CASE WHEN e.input_tokens > 0 THEN e.input_tokens ELSE 0 END), 0) AS input_tokens
    FROM embeddings e
    LEFT JOIN api_keys ak ON e.api_key_id = ak.id
    WHERE e.deleted = false
    GROUP BY e.model, e.status, ak.comment
  `);
  return rows as unknown as EmbeddingMetricsRow[];
}

// Histogram bucket boundaries in milliseconds (for LLM latency).
// These are cumulative "le"-style buckets; the +Inf bucket is derived from
// the query's total_count downstream.
export const LATENCY_BUCKETS_MS = [100, 250, 500, 1000, 2500, 5000, 10000, 30000, 60000, 120000];

/**
 * Build the cumulative-bucket SELECT fragments for one latency column.
 *
 * Produces one `SUM(CASE WHEN <column> <= <bound> ...) AS bucket_<bound>`
 * fragment per entry in LATENCY_BUCKETS_MS, joined for direct splicing into
 * a SELECT list.
 *
 * The column name is interpolated into raw SQL, so the parameter is
 * restricted to the two known-safe column literals — never widen it to
 * accept arbitrary strings.
 *
 * @param column latency column to bucket ("duration" or "ttft")
 * @returns comma-separated SQL fragments, one per bucket boundary
 */
function buildBucketCases(column: "duration" | "ttft"): string {
  return LATENCY_BUCKETS_MS.map(
    (b) => `SUM(CASE WHEN ${column} <= ${b} THEN 1 ELSE 0 END) AS bucket_${b}`,
  ).join(",\n      ");
}

// Pre-computed bucket case SQL fragments (constant, computed once at module load)
const DURATION_BUCKET_CASES = buildBucketCases("duration");
const TTFT_BUCKET_CASES = buildBucketCases("ttft");

/**
* Get completion duration histogram data grouped by model
* Duration is stored in milliseconds in the database
*
* Note: We use SUM(duration) not AVG because Prometheus histogram format requires
* the total sum of all observations (_sum metric). Average can be computed by
* Prometheus as sum/count when needed.
*/
/**
 * Per-model duration histogram rows for completions.
 *
 * Duration is stored in milliseconds. Emits one cumulative bucket column per
 * LATENCY_BUCKETS_MS boundary plus total_count and duration_sum, matching the
 * Prometheus histogram convention (_bucket/_count/_sum); the sum — not the
 * average — is returned so Prometheus can derive averages as sum/count.
 *
 * sql.raw is safe here: the interpolated fragments come from the module-level
 * constant DURATION_BUCKET_CASES, never from user input.
 */
export async function getCompletionDurationHistogram() {
  logger.debug("getCompletionDurationHistogram");
  const query = `
    SELECT
      model,
      ${DURATION_BUCKET_CASES},
      COUNT(*) AS total_count,
      COALESCE(SUM(duration), 0) AS duration_sum
    FROM completions
    WHERE deleted = false AND duration > 0
    GROUP BY model
  `;
  const rows = await db.execute(sql.raw(query));
  return rows as unknown as Record<string, string>[];
}

/**
* Get completion TTFT (Time To First Token) histogram data grouped by model
* TTFT is stored in milliseconds in the database
*/
/**
 * Per-model TTFT (Time To First Token) histogram rows for completions.
 *
 * TTFT is stored in milliseconds. Only completed requests with a positive
 * TTFT contribute, so partial or failed streams do not skew the histogram.
 *
 * sql.raw is safe here: the interpolated fragments come from the module-level
 * constant TTFT_BUCKET_CASES, never from user input.
 */
export async function getCompletionTTFTHistogram() {
  logger.debug("getCompletionTTFTHistogram");
  const query = `
    SELECT
      model,
      ${TTFT_BUCKET_CASES},
      COUNT(*) AS total_count,
      COALESCE(SUM(ttft), 0) AS ttft_sum
    FROM completions
    WHERE deleted = false AND ttft > 0 AND status = 'completed'
    GROUP BY model
  `;
  const rows = await db.execute(sql.raw(query));
  return rows as unknown as Record<string, string>[];
}

/**
* Get embedding duration histogram data grouped by model
* Duration is stored in milliseconds in the database
*/
/**
 * Per-model duration histogram rows for embeddings.
 *
 * Duration is stored in milliseconds. Emits one cumulative bucket column per
 * LATENCY_BUCKETS_MS boundary plus total_count and duration_sum, matching the
 * Prometheus histogram convention (_bucket/_count/_sum).
 *
 * sql.raw is safe here: the interpolated fragments come from the module-level
 * constant DURATION_BUCKET_CASES, never from user input.
 */
export async function getEmbeddingDurationHistogram() {
  logger.debug("getEmbeddingDurationHistogram");
  const query = `
    SELECT
      model,
      ${DURATION_BUCKET_CASES},
      COUNT(*) AS total_count,
      COALESCE(SUM(duration), 0) AS duration_sum
    FROM embeddings
    WHERE deleted = false AND duration > 0
    GROUP BY model
  `;
  const rows = await db.execute(sql.raw(query));
  return rows as unknown as Record<string, string>[];
}

/**
* Get API key rate limit configuration for Prometheus metrics
* Returns all active (non-revoked) API keys with their rate limits
*/
/**
 * Fetch rate-limit configuration for every active (non-revoked) API key.
 *
 * Used to export the configured RPM/TPM limits as Prometheus gauges, keyed
 * by the key's comment rather than its secret value.
 *
 * @returns id, comment, rpmLimit, and tpmLimit for each active key
 */
export async function getApiKeyRateLimitConfig() {
  logger.debug("getApiKeyRateLimitConfig");
  const keys = schema.ApiKeysTable;
  return await db
    .select({
      id: keys.id,
      comment: keys.comment,
      rpmLimit: keys.rpmLimit,
      tpmLimit: keys.tpmLimit,
    })
    .from(keys)
    .where(not(keys.revoked));
}

/**
* Get counts of active entities for Prometheus gauges
* Uses a single query with subqueries for efficiency (one DB round-trip)
*/
/**
 * Count active entities for Prometheus gauges in one DB round-trip.
 *
 * A single SELECT with scalar subqueries returns the number of non-revoked
 * API keys, non-deleted providers, and non-deleted chat/embedding models.
 *
 * @returns numeric counts, each defaulting to 0 if the row is missing
 */
export async function getActiveEntityCounts() {
  logger.debug("getActiveEntityCounts");

  const result = await db.execute(sql`
    SELECT
      (SELECT COUNT(*) FROM api_keys WHERE NOT revoked) AS api_keys,
      (SELECT COUNT(*) FROM providers WHERE NOT deleted) AS providers,
      (SELECT COUNT(*) FROM models WHERE NOT deleted AND model_type = 'chat') AS chat_models,
      (SELECT COUNT(*) FROM models WHERE NOT deleted AND model_type = 'embedding') AS embedding_models
  `);

  // Counts arrive as strings from the driver; coerce with a 0 fallback.
  const first = (result as unknown as Record<string, string>[])[0];
  const toCount = (value: string | undefined) => Number(value ?? 0);

  return {
    apiKeys: toCount(first?.api_keys),
    providers: toCount(first?.providers),
    chatModels: toCount(first?.chat_models),
    embeddingModels: toCount(first?.embedding_models),
  };
}
6 changes: 4 additions & 2 deletions backend/src/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ async function exists(path: string): Promise<boolean> {
}
import { join } from "node:path";
import { routes } from "@/api";
import { metricsApi } from "@/api/metrics";
import { loggerPlugin } from "@/plugins/loggerPlugin";
import {
ALLOWED_ORIGINS,
Expand Down Expand Up @@ -151,8 +152,8 @@ async function spaPlugin(dir: string) {
if (path.startsWith("/docs") || path.startsWith("/__tsr")) {
return status(404);
}
// Skip API routes
if (path.startsWith("/api") || path.startsWith("/v1")) {
// Skip API routes and metrics (include trailing slash to prevent SPA fallback)
if (path.startsWith("/api") || path.startsWith("/v1") || path === "/metrics" || path === "/metrics/") {
return status(404);
}

Expand Down Expand Up @@ -205,6 +206,7 @@ const app = new Elysia()
)
.use(serverTiming())
.use(routes)
.use(metricsApi)
.use(await docsPlugin(DOCS_DIR))
.use(await spaPlugin(FRONTEND_DIR))
.listen({
Expand Down
39 changes: 39 additions & 0 deletions backend/src/plugins/apiKeyRateLimitPlugin.ts
Original file line number Diff line number Diff line change
@@ -1,10 +1,43 @@
import { consola } from "consola";
import { Elysia } from "elysia";
import { apiKeyPlugin } from "./apiKeyPlugin";
import { checkRpmLimit, checkTpmLimit } from "@/utils/apiKeyRateLimit";
import { redisClient } from "@/utils/redisClient";

// Re-export consumeTokens for use in API handlers
export { consumeTokens } from "@/utils/apiKeyRateLimit";

const logger = consola.withTag("apiKeyRateLimitPlugin");

// Redis key for tracking rate limit rejections (for Prometheus metrics)
const RATE_LIMIT_REJECTIONS_KEY = "nexusgate:metrics:rate_limit_rejections";

/**
* Track a rate limit rejection in Redis for Prometheus metrics
* @param apiKeyComment The API key comment for label
* @param limitType Type of limit exceeded ('rpm' or 'tpm')
*/
/**
 * Record one rate-limit rejection in Redis for later export via /metrics.
 *
 * Increments the hash field "<comment>:<limitType>" under
 * RATE_LIMIT_REJECTIONS_KEY. Redis failures are logged and swallowed so
 * metrics bookkeeping can never break request handling.
 *
 * @param apiKeyComment comment label of the rejected key ('unknown' when null)
 * @param limitType which limit was exceeded ('rpm' or 'tpm')
 */
async function trackRateLimitRejection(
  apiKeyComment: string | null,
  limitType: "rpm" | "tpm",
): Promise<void> {
  const comment = apiKeyComment ?? "unknown";
  try {
    await redisClient.hincrby(RATE_LIMIT_REJECTIONS_KEY, `${comment}:${limitType}`, 1);
  } catch (error) {
    // Best-effort counter: never let a Redis hiccup surface to the caller.
    logger.error("Failed to track rate limit rejection:", error);
  }
}

/**
* Get all rate limit rejections from Redis for Prometheus metrics
*/
/**
 * Read every recorded rate-limit rejection counter from Redis.
 *
 * Returns the full hash at RATE_LIMIT_REJECTIONS_KEY: fields are
 * "<comment>:<rpm|tpm>" strings, values are stringified counts.
 */
export async function getRateLimitRejections(): Promise<
  Record<string, string>
> {
  const rejections = await redisClient.hgetall(RATE_LIMIT_REJECTIONS_KEY);
  return rejections;
}

/**
* OpenAI-compatible rate limit error response
*/
Expand Down Expand Up @@ -39,6 +72,9 @@ export const apiKeyRateLimitPlugin = new Elysia({
);

if (!rpmResult.allowed) {
// Track rejection for Prometheus metrics
await trackRateLimitRejection(apiKeyRecord.comment, "rpm");

set.headers["X-RateLimit-Limit-RPM"] =
apiKeyRecord.rpmLimit.toString();
set.headers["X-RateLimit-Remaining-RPM"] = "0";
Expand All @@ -59,6 +95,9 @@ export const apiKeyRateLimitPlugin = new Elysia({
);

if (!tpmResult.allowed) {
// Track rejection for Prometheus metrics
await trackRateLimitRejection(apiKeyRecord.comment, "tpm");

set.headers["X-RateLimit-Limit-TPM"] =
apiKeyRecord.tpmLimit.toString();
set.headers["X-RateLimit-Remaining-TPM"] = "0";
Expand Down
Loading