diff --git a/.github/workflows/run-pipeline.yml b/.github/workflows/run-pipeline.yml index 4bc90a4..57589f2 100644 --- a/.github/workflows/run-pipeline.yml +++ b/.github/workflows/run-pipeline.yml @@ -79,7 +79,10 @@ jobs: env: DATABASE_URL: ${{ secrets.DATABASE_URL }} OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} - TOKEN_LIMIT: 20000 + # Pipeline limits (0 = unlimited) + MAX_EMBEDDINGS: 0 + MAX_CONTENT_EXTRACTION: 0 + TOKEN_LIMIT: 0 NODE_ENV: production # Report pipeline completion diff --git a/apps/engine/.env.example b/apps/engine/.env.example index cb4275a..7acae97 100644 --- a/apps/engine/.env.example +++ b/apps/engine/.env.example @@ -1,8 +1,11 @@ # Example environment variables for the Neus engine # Copy this file to `.env` in the same directory and fill in your values. +# Database connection URL (PostgreSQL/Neon) +DATABASE_URL=postgresql://user:password@host/database?sslmode=require + # OpenAI API key used for embeddings and summarisation -OPENAI_API_KEY= +OPENAI_API_KEY=your-api-key-here # Log level for pipeline output (info, warn, error, debug) LOG_LEVEL=info @@ -10,7 +13,37 @@ LOG_LEVEL=info # Node environment mode NODE_ENV=development -# Cost controls -TOKEN_LIMIT=5000 +# ============================================================================ +# Pipeline Limits (optional - set to 0 or omit for unlimited) +# ============================================================================ + +# Max embeddings to create per pipeline run +# Cost: ~$0.00002 per article with text-embedding-3-small +# Set to 0 for unlimited, or a number like 200 for safety +# Default: 0 (unlimited) +MAX_EMBEDDINGS=0 + +# Max articles to extract full content for per run +# No direct cost, but takes time (scraping websites) +# Set to 0 for unlimited, or a number like 100 to limit processing time +# Default: 0 (unlimited) +MAX_CONTENT_EXTRACTION=0 + +# Max tokens for summarization per pipeline run +# Cost: ~$0.15 per 1M tokens with gpt-4o-mini +# Set to 0 for unlimited, or a number like 5000 to cap costs +# Default: 0 (unlimited) +# Example: 5000 tokens ≈ 10-15 cluster summaries ≈ $0.001 +TOKEN_LIMIT=0 + +# Model used for cluster summarization +# Options: gpt-4o-mini (cheap), gpt-4o (expensive but better quality) SUMMARY_MODEL=gpt-4o-mini -MAX_ARTICLES_TO_EMBED=100 + +# ============================================================================ +# Typical costs for hobby use (processing ~500 articles): +# - Embeddings: $0.01 +# - Summarization: $0.01 +# - Total per run: ~$0.02 +# Monthly (running weekly): ~$0.10 +# ============================================================================ diff --git a/apps/engine/core/pipeline/clusterArticles.test.ts b/apps/engine/core/pipeline/clusterArticles.test.ts index 66a8ad6..86501c1 100644 --- a/apps/engine/core/pipeline/clusterArticles.test.ts +++ b/apps/engine/core/pipeline/clusterArticles.test.ts @@ -2,6 +2,7 @@ import { jest } from '@jest/globals'; const mockDb = { getUnclusteredArticles: jest.fn(), + getRecentClustersWithEmbeddings: jest.fn(), createCluster: jest.fn(), createArticleAssignments: jest.fn(), updateClusterEmbedding: jest.fn(), @@ -26,6 +27,7 @@ describe('clusterRecentArticles', () => { { id: 'a1', embedding: [1, 0] }, { id: 'a2', embedding: [1, 0] }, ]); + mockDb.getRecentClustersWithEmbeddings.mockResolvedValue([]); mockDb.createCluster.mockResolvedValue({ id: 'c1' }); await clusterRecentArticles(); @@ -47,6 +49,7 @@ describe('clusterRecentArticles', () => { { id: 'a1', embedding: [1, 0], clusterAssignments: [] }, { id: 'a2', embedding: [1, 0], clusterAssignments: [{ clusterId: 'old' }] }, ]); + mockDb.getRecentClustersWithEmbeddings.mockResolvedValue([]); mockDb.createCluster.mockResolvedValue({ id: 'c1' }); await clusterRecentArticles(); diff --git a/apps/engine/core/pipeline/embedArticles.ts b/apps/engine/core/pipeline/embedArticles.ts index e48950e..5f28eb5 100644 --- a/apps/engine/core/pipeline/embedArticles.ts +++ b/apps/engine/core/pipeline/embedArticles.ts @@ -13,7 +13,10 @@ import { const openai = new OpenAI({ apiKey: process.env.OPENAI_API_KEY }); const EMBEDDING_MODEL = 'text-embedding-3-small'; const MAX_EMBEDDING_CHARS = 8192; -const MAX_EMBEDDINGS_PER_RUN = 200; // Safety limit to prevent runaway costs (~$0.002/run) +// Configurable via MAX_EMBEDDINGS env var. Set to 0 or omit for unlimited. +const MAX_EMBEDDINGS_PER_RUN = process.env.MAX_EMBEDDINGS + ? parseInt(process.env.MAX_EMBEDDINGS, 10) + : 0; // 0 = unlimited export async function embedNewArticles() { logPipelineStep(PipelineStep.Embed, 'Embedding new articles...'); @@ -22,12 +25,13 @@ export async function embedNewArticles() { logPipelineSection(PipelineStep.Embed, `Found ${unembedded.length} unembedded articles`); - // Apply safety limit - const articlesToEmbed = unembedded.slice(0, MAX_EMBEDDINGS_PER_RUN); - if (unembedded.length > MAX_EMBEDDINGS_PER_RUN) { + // Apply limit if configured + const articlesToEmbed = + MAX_EMBEDDINGS_PER_RUN > 0 ? unembedded.slice(0, MAX_EMBEDDINGS_PER_RUN) : unembedded; + if (MAX_EMBEDDINGS_PER_RUN > 0 && unembedded.length > MAX_EMBEDDINGS_PER_RUN) { logPipelineSection( PipelineStep.Embed, - `Limiting to ${MAX_EMBEDDINGS_PER_RUN} articles to prevent excessive API costs` + `Limiting to ${MAX_EMBEDDINGS_PER_RUN} articles (set MAX_EMBEDDINGS=0 for unlimited)` ); } diff --git a/apps/engine/core/pipeline/summarizeClusters.ts b/apps/engine/core/pipeline/summarizeClusters.ts index 1474076..4b949c1 100644 --- a/apps/engine/core/pipeline/summarizeClusters.ts +++ b/apps/engine/core/pipeline/summarizeClusters.ts @@ -52,8 +52,10 @@ export async function summarizeClusters() { totalTokensUsed += usage; console.log(`Cluster ${cluster.id} used ${usage} tokens (total: ${totalTokensUsed})`); - if (process.env.TOKEN_LIMIT && totalTokensUsed > parseInt(process.env.TOKEN_LIMIT)) { - console.warn('Token cap reached. Aborting summarisation.'); + // Check token limit if configured (0 or omitted = unlimited) + const tokenLimit = process.env.TOKEN_LIMIT ? parseInt(process.env.TOKEN_LIMIT, 10) : 0; + if (tokenLimit > 0 && totalTokensUsed > tokenLimit) { + console.warn(`Token limit (${tokenLimit}) reached. Aborting summarisation.`); break; } diff --git a/packages/db/src/articles/getArticlesMissingContent.ts b/packages/db/src/articles/getArticlesMissingContent.ts index d4db5de..8abccd9 100644 --- a/packages/db/src/articles/getArticlesMissingContent.ts +++ b/packages/db/src/articles/getArticlesMissingContent.ts @@ -5,7 +5,7 @@ import { prisma } from '../client'; * * Optimizations: * - Only fetches articles from the last 7 days (failed extractions don't retry forever) - * - Limits to 100 articles per run to prevent slow accumulation + * - Limits articles per run if MAX_CONTENT_EXTRACTION is set (set to 0 for unlimited) * - Orders by createdAt DESC to prioritize newest articles * * This prevents the pipeline from retrying failed content extractions indefinitely @@ -15,7 +15,11 @@ export async function getArticlesMissingContent() { const sevenDaysAgo = new Date(); sevenDaysAgo.setDate(sevenDaysAgo.getDate() - 7); - return prisma.article.findMany({ + const maxArticles = process.env.MAX_CONTENT_EXTRACTION + ? parseInt(process.env.MAX_CONTENT_EXTRACTION, 10) + : 0; // 0 = unlimited + + const query: any = { where: { OR: [{ content: null }, { content: '' }], createdAt: { @@ -25,6 +29,12 @@ export async function getArticlesMissingContent() { orderBy: { createdAt: 'desc', }, - take: 100, - }); + }; + + // Only add limit if configured + if (maxArticles > 0) { + query.take = maxArticles; + } + + return prisma.article.findMany(query); } diff --git a/packages/db/src/articles/getUnclusteredArticles.ts b/packages/db/src/articles/getUnclusteredArticles.ts index f49bea3..09b64ee 100644 --- a/packages/db/src/articles/getUnclusteredArticles.ts +++ b/packages/db/src/articles/getUnclusteredArticles.ts @@ -25,6 +25,11 @@ export async function getUnclusteredArticles(daysBack: number = 3) { gte: cutoffDate, }, }, + select: { + id: true, + embedding: true, + createdAt: true, + }, orderBy: { createdAt: 'desc', }, diff --git a/packages/db/src/articles/getUnembeddedArticles.ts b/packages/db/src/articles/getUnembeddedArticles.ts index 8b22824..90cc8a3 100644 --- a/packages/db/src/articles/getUnembeddedArticles.ts +++ b/packages/db/src/articles/getUnembeddedArticles.ts @@ -4,5 +4,12 @@ import { Prisma } from '@prisma/client'; export async function getUnembeddedArticles() { return prisma.article.findMany({ where: { embedding: { equals: Prisma.DbNull } }, + select: { + id: true, + title: true, + content: true, + snippet: true, + url: true, + }, }); } diff --git a/packages/db/src/clusters/getRankedClusters.test.ts b/packages/db/src/clusters/getRankedClusters.test.ts index a2aa4d1..23091b0 100644 --- a/packages/db/src/clusters/getRankedClusters.test.ts +++ b/packages/db/src/clusters/getRankedClusters.test.ts @@ -14,10 +14,36 @@ describe('getRankedClusters', () => { findMany.mockResolvedValue([]); await getRankedClusters(); expect(findMany).toHaveBeenCalledWith({ + where: { + AND: [ + { headline: { not: null } }, + { headline: { not: '' } }, + { summary: { not: null } }, + { summary: { not: '' } }, + { archived: false }, + ], + }, include: { articleAssignments: { - include: { - article: { include: { sourceRel: true } }, + select: { + createdAt: true, + article: { + select: { + id: true, + url: true, + title: true, + source: true, + publishedAt: true, + author: true, + sourceRel: { + select: { + id: true, + name: true, + faviconUrl: true, + }, + }, + }, + }, }, }, },