Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion .github/workflows/run-pipeline.yml
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,10 @@ jobs:
env:
DATABASE_URL: ${{ secrets.DATABASE_URL }}
OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
TOKEN_LIMIT: 20000
# Pipeline limits (0 = unlimited)
MAX_EMBEDDINGS: 0
MAX_CONTENT_EXTRACTION: 0
TOKEN_LIMIT: 0
NODE_ENV: production

# Report pipeline completion
Expand Down
41 changes: 37 additions & 4 deletions apps/engine/.env.example
Original file line number Diff line number Diff line change
@@ -1,16 +1,49 @@
# Example environment variables for the Neus engine
# Copy this file to `.env` in the same directory and fill in your values.

# Database connection URL (PostgreSQL/Neon)
DATABASE_URL=postgresql://user:password@host/database?sslmode=require

# OpenAI API key used for embeddings and summarization
OPENAI_API_KEY=
OPENAI_API_KEY=your-api-key-here

# Log level for pipeline output (info, warn, error, debug)
LOG_LEVEL=info

# Node environment mode
NODE_ENV=development

# Cost controls
TOKEN_LIMIT=5000
# ============================================================================
# Pipeline Limits (optional - set to 0 or omit for unlimited)
# ============================================================================

# Max embeddings to create per pipeline run
# Cost: ~$0.00002 per article with text-embedding-3-small
# Set to 0 for unlimited, or a number like 200 for safety
# Default: 0 (unlimited)
MAX_EMBEDDINGS=0

# Max articles to extract full content for per run
# No direct cost, but takes time (scraping websites)
# Set to 0 for unlimited, or a number like 100 to limit processing time
# Default: 0 (unlimited)
MAX_CONTENT_EXTRACTION=0

# Max tokens for summarization per pipeline run
# Cost: ~$0.15 per 1M tokens with gpt-4o-mini
# Set to 0 for unlimited, or a number like 5000 to cap costs
# Default: 0 (unlimited)
# Example: 5000 tokens ≈ 10-15 cluster summaries ≈ $0.001
TOKEN_LIMIT=0

# Model used for cluster summarization
# Options: gpt-4o-mini (cheap), gpt-4o (expensive but better quality)
SUMMARY_MODEL=gpt-4o-mini
MAX_ARTICLES_TO_EMBED=100

# ============================================================================
# Typical costs for hobby use (processing ~500 articles):
# - Embeddings: $0.01
# - Summarization: $0.01
# - Total per run: ~$0.02
# Monthly (running weekly): ~$0.10
# ============================================================================
3 changes: 3 additions & 0 deletions apps/engine/core/pipeline/clusterArticles.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ import { jest } from '@jest/globals';

const mockDb = {
getUnclusteredArticles: jest.fn(),
getRecentClustersWithEmbeddings: jest.fn(),
createCluster: jest.fn(),
createArticleAssignments: jest.fn(),
updateClusterEmbedding: jest.fn(),
Expand All @@ -26,6 +27,7 @@ describe('clusterRecentArticles', () => {
{ id: 'a1', embedding: [1, 0] },
{ id: 'a2', embedding: [1, 0] },
]);
mockDb.getRecentClustersWithEmbeddings.mockResolvedValue([]);
mockDb.createCluster.mockResolvedValue({ id: 'c1' });

await clusterRecentArticles();
Expand All @@ -47,6 +49,7 @@ describe('clusterRecentArticles', () => {
{ id: 'a1', embedding: [1, 0], clusterAssignments: [] },
{ id: 'a2', embedding: [1, 0], clusterAssignments: [{ clusterId: 'old' }] },
]);
mockDb.getRecentClustersWithEmbeddings.mockResolvedValue([]);
mockDb.createCluster.mockResolvedValue({ id: 'c1' });

await clusterRecentArticles();
Expand Down
14 changes: 9 additions & 5 deletions apps/engine/core/pipeline/embedArticles.ts
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,10 @@ import {
const openai = new OpenAI({ apiKey: process.env.OPENAI_API_KEY });
const EMBEDDING_MODEL = 'text-embedding-3-small';
const MAX_EMBEDDING_CHARS = 8192;
const MAX_EMBEDDINGS_PER_RUN = 200; // Safety limit to prevent runaway costs (~$0.002/run)
// Configurable via MAX_EMBEDDINGS env var. Set to 0 or omit for unlimited.
const MAX_EMBEDDINGS_PER_RUN = process.env.MAX_EMBEDDINGS
? parseInt(process.env.MAX_EMBEDDINGS, 10)
: 0; // 0 = unlimited

export async function embedNewArticles() {
logPipelineStep(PipelineStep.Embed, 'Embedding new articles...');
Expand All @@ -22,12 +25,13 @@ export async function embedNewArticles() {

logPipelineSection(PipelineStep.Embed, `Found ${unembedded.length} unembedded articles`);

// Apply safety limit
const articlesToEmbed = unembedded.slice(0, MAX_EMBEDDINGS_PER_RUN);
if (unembedded.length > MAX_EMBEDDINGS_PER_RUN) {
// Apply limit if configured
const articlesToEmbed =
MAX_EMBEDDINGS_PER_RUN > 0 ? unembedded.slice(0, MAX_EMBEDDINGS_PER_RUN) : unembedded;
if (MAX_EMBEDDINGS_PER_RUN > 0 && unembedded.length > MAX_EMBEDDINGS_PER_RUN) {
logPipelineSection(
PipelineStep.Embed,
`Limiting to ${MAX_EMBEDDINGS_PER_RUN} articles to prevent excessive API costs`
`Limiting to ${MAX_EMBEDDINGS_PER_RUN} articles (set MAX_EMBEDDINGS=0 for unlimited)`
);
}

Expand Down
6 changes: 4 additions & 2 deletions apps/engine/core/pipeline/summarizeClusters.ts
Original file line number Diff line number Diff line change
Expand Up @@ -52,8 +52,10 @@ export async function summarizeClusters() {
totalTokensUsed += usage;
console.log(`Cluster ${cluster.id} used ${usage} tokens (total: ${totalTokensUsed})`);

if (process.env.TOKEN_LIMIT && totalTokensUsed > parseInt(process.env.TOKEN_LIMIT)) {
console.warn('Token cap reached. Aborting summarisation.');
// Check token limit if configured (0 or omitted = unlimited)
const tokenLimit = process.env.TOKEN_LIMIT ? parseInt(process.env.TOKEN_LIMIT, 10) : 0;
if (tokenLimit > 0 && totalTokensUsed > tokenLimit) {
console.warn(`Token limit (${tokenLimit}) reached. Aborting summarisation.`);
break;
}

Expand Down
18 changes: 14 additions & 4 deletions packages/db/src/articles/getArticlesMissingContent.ts
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ import { prisma } from '../client';
*
* Optimizations:
* - Only fetches articles from the last 7 days (failed extractions don't retry forever)
* - Limits to 100 articles per run to prevent slow accumulation
* - Limits articles per run if MAX_CONTENT_EXTRACTION is set (set to 0 for unlimited)
* - Orders by createdAt DESC to prioritize newest articles
*
* This prevents the pipeline from retrying failed content extractions indefinitely
Expand All @@ -15,7 +15,11 @@ export async function getArticlesMissingContent() {
const sevenDaysAgo = new Date();
sevenDaysAgo.setDate(sevenDaysAgo.getDate() - 7);

return prisma.article.findMany({
const maxArticles = process.env.MAX_CONTENT_EXTRACTION
? parseInt(process.env.MAX_CONTENT_EXTRACTION, 10)
: 0; // 0 = unlimited

const query: any = {
where: {
OR: [{ content: null }, { content: '' }],
createdAt: {
Expand All @@ -25,6 +29,12 @@ export async function getArticlesMissingContent() {
orderBy: {
createdAt: 'desc',
},
take: 100,
});
};

// Only add limit if configured
if (maxArticles > 0) {
query.take = maxArticles;
}

return prisma.article.findMany(query);
}
5 changes: 5 additions & 0 deletions packages/db/src/articles/getUnclusteredArticles.ts
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,11 @@ export async function getUnclusteredArticles(daysBack: number = 3) {
gte: cutoffDate,
},
},
select: {
id: true,
embedding: true,
createdAt: true,
},
orderBy: {
createdAt: 'desc',
},
Expand Down
7 changes: 7 additions & 0 deletions packages/db/src/articles/getUnembeddedArticles.ts
Original file line number Diff line number Diff line change
Expand Up @@ -4,5 +4,12 @@ import { Prisma } from '@prisma/client';
export async function getUnembeddedArticles() {
return prisma.article.findMany({
where: { embedding: { equals: Prisma.DbNull } },
select: {
id: true,
title: true,
content: true,
snippet: true,
url: true,
},
});
}
30 changes: 28 additions & 2 deletions packages/db/src/clusters/getRankedClusters.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -14,10 +14,36 @@ describe('getRankedClusters', () => {
findMany.mockResolvedValue([]);
await getRankedClusters();
expect(findMany).toHaveBeenCalledWith({
where: {
AND: [
{ headline: { not: null } },
{ headline: { not: '' } },
{ summary: { not: null } },
{ summary: { not: '' } },
{ archived: false },
],
},
include: {
articleAssignments: {
include: {
article: { include: { sourceRel: true } },
select: {
createdAt: true,
article: {
select: {
id: true,
url: true,
title: true,
source: true,
publishedAt: true,
author: true,
sourceRel: {
select: {
id: true,
name: true,
faviconUrl: true,
},
},
},
},
},
},
},
Expand Down