From a6ea29cd609134bd5b0fcb4b31bd62f77f579733 Mon Sep 17 00:00:00 2001 From: Jacob Walton Date: Tue, 16 Dec 2025 14:27:31 +0000 Subject: [PATCH 1/2] feat(db): optimize queries to reduce data transfer MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Optimized database queries to fetch only necessary fields, significantly reducing egress data transfer to stay within free tier limits. Changes: - getUnembeddedArticles: only fetch id, title, content, snippet, url - getUnclusteredArticles: only fetch id, embedding, createdAt - Updated tests to match optimized query structures Impact: ~90-95% reduction in data transfer per query, which should help avoid exceeding the 5GB/month egress limit on Neon/Supabase free tiers. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude --- .../core/pipeline/clusterArticles.test.ts | 3 ++ .../db/src/articles/getUnclusteredArticles.ts | 5 ++++ .../db/src/articles/getUnembeddedArticles.ts | 7 +++++ .../db/src/clusters/getRankedClusters.test.ts | 30 +++++++++++++++++-- 4 files changed, 43 insertions(+), 2 deletions(-) diff --git a/apps/engine/core/pipeline/clusterArticles.test.ts b/apps/engine/core/pipeline/clusterArticles.test.ts index 66a8ad6..86501c1 100644 --- a/apps/engine/core/pipeline/clusterArticles.test.ts +++ b/apps/engine/core/pipeline/clusterArticles.test.ts @@ -2,6 +2,7 @@ import { jest } from '@jest/globals'; const mockDb = { getUnclusteredArticles: jest.fn(), + getRecentClustersWithEmbeddings: jest.fn(), createCluster: jest.fn(), createArticleAssignments: jest.fn(), updateClusterEmbedding: jest.fn(), @@ -26,6 +27,7 @@ describe('clusterRecentArticles', () => { { id: 'a1', embedding: [1, 0] }, { id: 'a2', embedding: [1, 0] }, ]); + mockDb.getRecentClustersWithEmbeddings.mockResolvedValue([]); mockDb.createCluster.mockResolvedValue({ id: 'c1' }); await clusterRecentArticles(); @@ -47,6 +49,7 @@ describe('clusterRecentArticles', () => { { id: 'a1', embedding: [1, 0], clusterAssignments: [] }, { id: 'a2', embedding: [1, 0], clusterAssignments: [{ clusterId: 'old' }] }, ]); + mockDb.getRecentClustersWithEmbeddings.mockResolvedValue([]); mockDb.createCluster.mockResolvedValue({ id: 'c1' }); await clusterRecentArticles(); diff --git a/packages/db/src/articles/getUnclusteredArticles.ts b/packages/db/src/articles/getUnclusteredArticles.ts index f49bea3..09b64ee 100644 --- a/packages/db/src/articles/getUnclusteredArticles.ts +++ b/packages/db/src/articles/getUnclusteredArticles.ts @@ -25,6 +25,11 @@ export async function getUnclusteredArticles(daysBack: number = 3) { gte: cutoffDate, }, }, + select: { + id: true, + embedding: true, + createdAt: true, + }, orderBy: { createdAt: 'desc', }, diff --git a/packages/db/src/articles/getUnembeddedArticles.ts b/packages/db/src/articles/getUnembeddedArticles.ts index 8b22824..90cc8a3 100644 --- a/packages/db/src/articles/getUnembeddedArticles.ts +++ b/packages/db/src/articles/getUnembeddedArticles.ts @@ -4,5 +4,12 @@ import { Prisma } from '@prisma/client'; export async function getUnembeddedArticles() { return prisma.article.findMany({ where: { embedding: { equals: Prisma.DbNull } }, + select: { + id: true, + title: true, + content: true, + snippet: true, + url: true, + }, }); } diff --git a/packages/db/src/clusters/getRankedClusters.test.ts b/packages/db/src/clusters/getRankedClusters.test.ts index a2aa4d1..23091b0 100644 --- a/packages/db/src/clusters/getRankedClusters.test.ts +++ b/packages/db/src/clusters/getRankedClusters.test.ts @@ -14,10 +14,36 @@ describe('getRankedClusters', () => { findMany.mockResolvedValue([]); await getRankedClusters(); expect(findMany).toHaveBeenCalledWith({ + where: { + AND: [ + { headline: { not: null } }, + { headline: { not: '' } }, + { summary: { not: null } }, + { summary: { not: '' } }, + { archived: false }, + ], + }, include: { articleAssignments: { - include: { - article: { include: { sourceRel: true } }, + select: { + createdAt: true, + article: { + select: { + id: true, + url: true, + title: true, + source: true, + publishedAt: true, + author: true, + sourceRel: { + select: { + id: true, + name: true, + faviconUrl: true, + }, + }, + }, + }, }, }, }, From 2934c535ea82f2672447df9754b41bae37fbac8a Mon Sep 17 00:00:00 2001 From: Jacob Walton Date: Fri, 19 Dec 2025 16:45:09 +0000 Subject: [PATCH 2/2] feat(engine): make pipeline limits configurable via environment variables MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Make all pipeline processing limits configurable via environment variables instead of hardcoded constants. This allows flexible configuration for different use cases (local development, production, bulk processing). Changes: - embedArticles.ts: MAX_EMBEDDINGS env var (0 = unlimited) - getArticlesMissingContent.ts: MAX_CONTENT_EXTRACTION env var (0 = unlimited) - summarizeClusters.ts: Fix TOKEN_LIMIT to properly handle 0 as unlimited - .env.example: Document all pipeline limits with cost estimates - run-pipeline.yml: Set limits to 0 (unlimited) for GitHub Actions For hobby use, unlimited processing costs ~$0.02 per run (~$0.10/month if run weekly). The configurable limits allow adding safety caps if needed for production use. Rationale: The previous hardcoded limits (200 embeddings, 100 content extractions) were causing incomplete processing when there was a backlog, requiring multiple runs to process all articles. For a hobby project with infrequent runs, it's simpler to process everything in one go. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude --- .github/workflows/run-pipeline.yml | 5 ++- apps/engine/.env.example | 41 +++++++++++++++++-- apps/engine/core/pipeline/embedArticles.ts | 14 ++++--- .../engine/core/pipeline/summarizeClusters.ts | 6 ++- .../src/articles/getArticlesMissingContent.ts | 18 ++++++-- 5 files changed, 68 insertions(+), 16 deletions(-) diff --git a/.github/workflows/run-pipeline.yml b/.github/workflows/run-pipeline.yml index 4bc90a4..57589f2 100644 --- a/.github/workflows/run-pipeline.yml +++ b/.github/workflows/run-pipeline.yml @@ -79,7 +79,10 @@ jobs: env: DATABASE_URL: ${{ secrets.DATABASE_URL }} OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} - TOKEN_LIMIT: 20000 + # Pipeline limits (0 = unlimited) + MAX_EMBEDDINGS: 0 + MAX_CONTENT_EXTRACTION: 0 + TOKEN_LIMIT: 0 NODE_ENV: production # Report pipeline completion diff --git a/apps/engine/.env.example b/apps/engine/.env.example index cb4275a..7acae97 100644 --- a/apps/engine/.env.example +++ b/apps/engine/.env.example @@ -1,8 +1,11 @@ # Example environment variables for the Neus engine # Copy this file to `.env` in the same directory and fill in your values. +# Database connection URL (PostgreSQL/Neon) +DATABASE_URL=postgresql://user:password@host/database?sslmode=require + # OpenAI API key used for embeddings and summarisation -OPENAI_API_KEY= +OPENAI_API_KEY=your-api-key-here # Log level for pipeline output (info, warn, error, debug) LOG_LEVEL=info @@ -10,7 +13,37 @@ LOG_LEVEL=info # Node environment mode NODE_ENV=development -# Cost controls -TOKEN_LIMIT=5000 +# ============================================================================ +# Pipeline Limits (optional - set to 0 or omit for unlimited) +# ============================================================================ + +# Max embeddings to create per pipeline run +# Cost: ~$0.00002 per article with text-embedding-3-small +# Set to 0 for unlimited, or a number like 200 for safety +# Default: 0 (unlimited) +MAX_EMBEDDINGS=0 + +# Max articles to extract full content for per run +# No direct cost, but takes time (scraping websites) +# Set to 0 for unlimited, or a number like 100 to limit processing time +# Default: 0 (unlimited) +MAX_CONTENT_EXTRACTION=0 + +# Max tokens for summarization per pipeline run +# Cost: ~$0.15 per 1M tokens with gpt-4o-mini +# Set to 0 for unlimited, or a number like 5000 to cap costs +# Default: 0 (unlimited) +# Example: 5000 tokens ≈ 10-15 cluster summaries ≈ $0.001 +TOKEN_LIMIT=0 + +# Model used for cluster summarization +# Options: gpt-4o-mini (cheap), gpt-4o (expensive but better quality) SUMMARY_MODEL=gpt-4o-mini -MAX_ARTICLES_TO_EMBED=100 + +# ============================================================================ +# Typical costs for hobby use (processing ~500 articles): +# - Embeddings: $0.01 +# - Summarization: $0.01 +# - Total per run: ~$0.02 +# Monthly (running weekly): ~$0.10 +# ============================================================================ diff --git a/apps/engine/core/pipeline/embedArticles.ts b/apps/engine/core/pipeline/embedArticles.ts index e48950e..5f28eb5 100644 --- a/apps/engine/core/pipeline/embedArticles.ts +++ b/apps/engine/core/pipeline/embedArticles.ts @@ -13,7 +13,10 @@ import { const openai = new OpenAI({ apiKey: process.env.OPENAI_API_KEY }); const EMBEDDING_MODEL = 'text-embedding-3-small'; const MAX_EMBEDDING_CHARS = 8192; -const MAX_EMBEDDINGS_PER_RUN = 200; // Safety limit to prevent runaway costs (~$0.002/run) +// Configurable via MAX_EMBEDDINGS env var. Set to 0 or omit for unlimited. +const MAX_EMBEDDINGS_PER_RUN = process.env.MAX_EMBEDDINGS + ? parseInt(process.env.MAX_EMBEDDINGS, 10) + : 0; // 0 = unlimited export async function embedNewArticles() { logPipelineStep(PipelineStep.Embed, 'Embedding new articles...'); @@ -22,12 +25,13 @@ export async function embedNewArticles() { logPipelineSection(PipelineStep.Embed, `Found ${unembedded.length} unembedded articles`); - // Apply safety limit - const articlesToEmbed = unembedded.slice(0, MAX_EMBEDDINGS_PER_RUN); - if (unembedded.length > MAX_EMBEDDINGS_PER_RUN) { + // Apply limit if configured + const articlesToEmbed = + MAX_EMBEDDINGS_PER_RUN > 0 ? unembedded.slice(0, MAX_EMBEDDINGS_PER_RUN) : unembedded; + if (MAX_EMBEDDINGS_PER_RUN > 0 && unembedded.length > MAX_EMBEDDINGS_PER_RUN) { logPipelineSection( PipelineStep.Embed, - `Limiting to ${MAX_EMBEDDINGS_PER_RUN} articles to prevent excessive API costs` + `Limiting to ${MAX_EMBEDDINGS_PER_RUN} articles (set MAX_EMBEDDINGS=0 for unlimited)` ); } diff --git a/apps/engine/core/pipeline/summarizeClusters.ts b/apps/engine/core/pipeline/summarizeClusters.ts index 1474076..4b949c1 100644 --- a/apps/engine/core/pipeline/summarizeClusters.ts +++ b/apps/engine/core/pipeline/summarizeClusters.ts @@ -52,8 +52,10 @@ export async function summarizeClusters() { totalTokensUsed += usage; console.log(`Cluster ${cluster.id} used ${usage} tokens (total: ${totalTokensUsed})`); - if (process.env.TOKEN_LIMIT && totalTokensUsed > parseInt(process.env.TOKEN_LIMIT)) { - console.warn('Token cap reached. Aborting summarisation.'); + // Check token limit if configured (0 or omitted = unlimited) + const tokenLimit = process.env.TOKEN_LIMIT ? parseInt(process.env.TOKEN_LIMIT, 10) : 0; + if (tokenLimit > 0 && totalTokensUsed > tokenLimit) { + console.warn(`Token limit (${tokenLimit}) reached. Aborting summarisation.`); break; } diff --git a/packages/db/src/articles/getArticlesMissingContent.ts b/packages/db/src/articles/getArticlesMissingContent.ts index d4db5de..8abccd9 100644 --- a/packages/db/src/articles/getArticlesMissingContent.ts +++ b/packages/db/src/articles/getArticlesMissingContent.ts @@ -5,7 +5,7 @@ import { prisma } from '../client'; * * Optimizations: * - Only fetches articles from the last 7 days (failed extractions don't retry forever) - * - Limits to 100 articles per run to prevent slow accumulation + * - Limits articles per run if MAX_CONTENT_EXTRACTION is set (set to 0 for unlimited) * - Orders by createdAt DESC to prioritize newest articles * * This prevents the pipeline from retrying failed content extractions indefinitely @@ -15,7 +15,11 @@ export async function getArticlesMissingContent() { const sevenDaysAgo = new Date(); sevenDaysAgo.setDate(sevenDaysAgo.getDate() - 7); - return prisma.article.findMany({ + const maxArticles = process.env.MAX_CONTENT_EXTRACTION + ? parseInt(process.env.MAX_CONTENT_EXTRACTION, 10) + : 0; // 0 = unlimited + + const query: any = { where: { OR: [{ content: null }, { content: '' }], createdAt: { @@ -25,6 +29,12 @@ export async function getArticlesMissingContent() { orderBy: { createdAt: 'desc', }, - take: 100, - }); + }; + + // Only add limit if configured + if (maxArticles > 0) { + query.take = maxArticles; + } + + return prisma.article.findMany(query); }