Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion .github/workflows/run-pipeline.yml
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,10 @@ jobs:
env:
DATABASE_URL: ${{ secrets.DATABASE_URL }}
OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
TOKEN_LIMIT: 20000
# Pipeline limits (0 = unlimited)
MAX_EMBEDDINGS: 0
MAX_CONTENT_EXTRACTION: 0
TOKEN_LIMIT: 0
NODE_ENV: production

# Report pipeline completion
Expand Down
41 changes: 37 additions & 4 deletions apps/engine/.env.example
Original file line number Diff line number Diff line change
@@ -1,16 +1,49 @@
# Example environment variables for the Neus engine
# Copy this file to `.env` in the same directory and fill in your values.

# Database connection URL (PostgreSQL/Neon)
DATABASE_URL=postgresql://user:password@host/database?sslmode=require

# OpenAI API key used for embeddings and summarization
OPENAI_API_KEY=
OPENAI_API_KEY=your-api-key-here

# Log level for pipeline output (info, warn, error, debug)
LOG_LEVEL=info

# Node environment mode
NODE_ENV=development

# Cost controls
TOKEN_LIMIT=5000
# ============================================================================
# Pipeline Limits (optional - set to 0 or omit for unlimited)
# ============================================================================

# Max embeddings to create per pipeline run
# Cost: ~$0.00002 per article with text-embedding-3-small
# Set to 0 for unlimited, or a number like 200 for safety
# Default: 0 (unlimited)
MAX_EMBEDDINGS=0

# Max articles to extract full content for per run
# No direct cost, but takes time (scraping websites)
# Set to 0 for unlimited, or a number like 100 to limit processing time
# Default: 0 (unlimited)
MAX_CONTENT_EXTRACTION=0

# Max tokens for summarization per pipeline run
# Cost: ~$0.15 per 1M tokens with gpt-4o-mini
# Set to 0 for unlimited, or a number like 5000 to cap costs
# Default: 0 (unlimited)
# Example: 5000 tokens ≈ 10-15 cluster summaries ≈ $0.001
TOKEN_LIMIT=0

# Model used for cluster summarization
# Options: gpt-4o-mini (cheap), gpt-4o (expensive but better quality)
SUMMARY_MODEL=gpt-4o-mini
MAX_ARTICLES_TO_EMBED=100

# ============================================================================
# Typical costs for hobby use (processing ~500 articles):
# - Embeddings: $0.01
# - Summarization: $0.01
# - Total per run: ~$0.02
# Monthly (running weekly): ~$0.10
# ============================================================================
3 changes: 3 additions & 0 deletions apps/engine/core/pipeline/clusterArticles.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ import { jest } from '@jest/globals';

const mockDb = {
getUnclusteredArticles: jest.fn(),
getRecentClustersWithEmbeddings: jest.fn(),
createCluster: jest.fn(),
createArticleAssignments: jest.fn(),
updateClusterEmbedding: jest.fn(),
Expand All @@ -26,6 +27,7 @@ describe('clusterRecentArticles', () => {
{ id: 'a1', embedding: [1, 0] },
{ id: 'a2', embedding: [1, 0] },
]);
mockDb.getRecentClustersWithEmbeddings.mockResolvedValue([]);
mockDb.createCluster.mockResolvedValue({ id: 'c1' });

await clusterRecentArticles();
Expand All @@ -47,6 +49,7 @@ describe('clusterRecentArticles', () => {
{ id: 'a1', embedding: [1, 0], clusterAssignments: [] },
{ id: 'a2', embedding: [1, 0], clusterAssignments: [{ clusterId: 'old' }] },
]);
mockDb.getRecentClustersWithEmbeddings.mockResolvedValue([]);
mockDb.createCluster.mockResolvedValue({ id: 'c1' });

await clusterRecentArticles();
Expand Down
14 changes: 9 additions & 5 deletions apps/engine/core/pipeline/embedArticles.ts
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,10 @@ import {
const openai = new OpenAI({ apiKey: process.env.OPENAI_API_KEY });
const EMBEDDING_MODEL = 'text-embedding-3-small';
const MAX_EMBEDDING_CHARS = 8192;
const MAX_EMBEDDINGS_PER_RUN = 200; // Safety limit to prevent runaway costs (~$0.002/run)
// Configurable via MAX_EMBEDDINGS env var. Set to 0 or omit for unlimited.
const MAX_EMBEDDINGS_PER_RUN = process.env.MAX_EMBEDDINGS
? parseInt(process.env.MAX_EMBEDDINGS, 10)
: 0; // 0 = unlimited

export async function embedNewArticles() {
logPipelineStep(PipelineStep.Embed, 'Embedding new articles...');
Expand All @@ -22,12 +25,13 @@ export async function embedNewArticles() {

logPipelineSection(PipelineStep.Embed, `Found ${unembedded.length} unembedded articles`);

// Apply safety limit
const articlesToEmbed = unembedded.slice(0, MAX_EMBEDDINGS_PER_RUN);
if (unembedded.length > MAX_EMBEDDINGS_PER_RUN) {
// Apply limit if configured
const articlesToEmbed =
MAX_EMBEDDINGS_PER_RUN > 0 ? unembedded.slice(0, MAX_EMBEDDINGS_PER_RUN) : unembedded;
if (MAX_EMBEDDINGS_PER_RUN > 0 && unembedded.length > MAX_EMBEDDINGS_PER_RUN) {
logPipelineSection(
PipelineStep.Embed,
`Limiting to ${MAX_EMBEDDINGS_PER_RUN} articles to prevent excessive API costs`
`Limiting to ${MAX_EMBEDDINGS_PER_RUN} articles (set MAX_EMBEDDINGS=0 for unlimited)`
);
}

Expand Down
6 changes: 4 additions & 2 deletions apps/engine/core/pipeline/summarizeClusters.ts
Original file line number Diff line number Diff line change
Expand Up @@ -52,8 +52,10 @@ export async function summarizeClusters() {
totalTokensUsed += usage;
console.log(`Cluster ${cluster.id} used ${usage} tokens (total: ${totalTokensUsed})`);

if (process.env.TOKEN_LIMIT && totalTokensUsed > parseInt(process.env.TOKEN_LIMIT)) {
console.warn('Token cap reached. Aborting summarisation.');
// Check token limit if configured (0 or omitted = unlimited)
const tokenLimit = process.env.TOKEN_LIMIT ? parseInt(process.env.TOKEN_LIMIT, 10) : 0;
if (tokenLimit > 0 && totalTokensUsed > tokenLimit) {
console.warn(`Token limit (${tokenLimit}) reached. Aborting summarisation.`);
break;
}

Expand Down
18 changes: 14 additions & 4 deletions packages/db/src/articles/getArticlesMissingContent.ts
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ import { prisma } from '../client';
*
* Optimizations:
* - Only fetches articles from the last 7 days (failed extractions don't retry forever)
* - Limits to 100 articles per run to prevent slow accumulation
* - Limits articles per run if MAX_CONTENT_EXTRACTION is set (set to 0 for unlimited)
* - Orders by createdAt DESC to prioritize newest articles
*
* This prevents the pipeline from retrying failed content extractions indefinitely
Expand All @@ -15,7 +15,11 @@ export async function getArticlesMissingContent() {
const sevenDaysAgo = new Date();
sevenDaysAgo.setDate(sevenDaysAgo.getDate() - 7);

return prisma.article.findMany({
const maxArticles = process.env.MAX_CONTENT_EXTRACTION
? parseInt(process.env.MAX_CONTENT_EXTRACTION, 10)
: 0; // 0 = unlimited

const query: any = {
where: {
OR: [{ content: null }, { content: '' }],
createdAt: {
Expand All @@ -25,6 +29,12 @@ export async function getArticlesMissingContent() {
orderBy: {
createdAt: 'desc',
},
take: 100,
});
};

// Only add limit if configured
if (maxArticles > 0) {
query.take = maxArticles;
}

return prisma.article.findMany(query);
}
5 changes: 5 additions & 0 deletions packages/db/src/articles/getUnclusteredArticles.ts
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,11 @@ export async function getUnclusteredArticles(daysBack: number = 3) {
gte: cutoffDate,
},
},
select: {
id: true,
embedding: true,
createdAt: true,
},
orderBy: {
createdAt: 'desc',
},
Expand Down
7 changes: 7 additions & 0 deletions packages/db/src/articles/getUnembeddedArticles.ts
Original file line number Diff line number Diff line change
Expand Up @@ -4,5 +4,12 @@ import { Prisma } from '@prisma/client';
export async function getUnembeddedArticles() {
return prisma.article.findMany({
where: { embedding: { equals: Prisma.DbNull } },
select: {
id: true,
title: true,
content: true,
snippet: true,
url: true,
},
});
}
30 changes: 28 additions & 2 deletions packages/db/src/clusters/getRankedClusters.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -14,10 +14,36 @@ describe('getRankedClusters', () => {
findMany.mockResolvedValue([]);
await getRankedClusters();
expect(findMany).toHaveBeenCalledWith({
where: {
AND: [
{ headline: { not: null } },
{ headline: { not: '' } },
{ summary: { not: null } },
{ summary: { not: '' } },
{ archived: false },
],
},
include: {
articleAssignments: {
include: {
article: { include: { sourceRel: true } },
select: {
createdAt: true,
article: {
select: {
id: true,
url: true,
title: true,
source: true,
publishedAt: true,
author: true,
sourceRel: {
select: {
id: true,
name: true,
faviconUrl: true,
},
},
},
},
},
},
},
Expand Down