-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathserver.js
More file actions
229 lines (193 loc) · 7.67 KB
/
server.js
File metadata and controls
229 lines (193 loc) · 7.67 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
// server.js (Manual Gemini API - No LangChain)
import "dotenv/config";
import express from "express";
import cors from "cors";
import multer from "multer";
import { createRequire } from "module";
// pdf-parse is loaded through a CommonJS require() bridge created for this
// ES module (createRequire) rather than through a static import.
const require = createRequire(import.meta.url);
const pdfParseMod = require("pdf-parse");

/**
 * Picks the usable export out of the loaded pdf-parse module.
 *
 * @param {*} mod - Value returned by require("pdf-parse").
 * @returns {*} `mod` itself when it is a function; otherwise `mod.default`,
 *   or null when that is null/undefined.
 */
function resolvePdfParse(mod) {
  if (typeof mod === "function") {
    return mod;
  }
  return mod?.default ?? null;
}

const pdfParse = resolvePdfParse(pdfParseMod);

// Fail at startup: the ingest route cannot work without a parser.
if (!pdfParse) {
  throw new Error("pdf-parse load failed. Run `npm i pdf-parse@1.1.1`.");
}
import { readFile } from "fs/promises";
import { ChromaClient } from "chromadb";
import { DefaultEmbeddingFunction } from "@chroma-core/default-embed";
import { randomUUID } from "crypto";
// Manual Gemini API function (replaces LangChain)
/**
 * Sends the question plus retrieved context to the Gemini `generateContent`
 * REST endpoint and returns the text of the first part of the first candidate.
 *
 * @param {string} question - The user's question, interpolated into the prompt.
 * @param {string} context - Retrieved passages the model is told to rely on.
 * @returns {Promise<string>} The generated answer text.
 * @throws {Error} "Missing GOOGLE_API_KEY in .env" when the env var is unset.
 * @throws {Error} "Gemini API Error: <status> - <body>" on a non-2xx response.
 * @throws {Error} "Invalid API response - no content generated" when the
 *   response carries no candidate text (for example a candidate without parts).
 */
async function generateAnswer(question, context) {
  const apiKey = process.env.GOOGLE_API_KEY;
  if (!apiKey) throw new Error("Missing GOOGLE_API_KEY in .env");

  const model = "gemini-2.5-flash"; // Stable v1beta model as of Nov 2025
  // The key travels in the x-goog-api-key header rather than the query string
  // so it does not end up in URLs recorded by proxies or request logs.
  const url = `https://generativelanguage.googleapis.com/v1beta/models/${model}:generateContent`;

  // Joined with "\n" so the prompt text is independent of source indentation.
  const promptText = [
    "You are a helpful assistant. Answer the question using ONLY the context below.",
    'If nothing relevant, say "I don\'t know based on the PDF."',
    `Question: ${question}`,
    `Context: ${context}`,
    "Answer concisely and naturally.",
  ].join("\n");

  const payload = {
    contents: [{
      parts: [{ text: promptText }],
    }],
    generationConfig: { temperature: 0.7 }, // Optional: Balanced creativity
  };

  const res = await fetch(url, {
    method: "POST",
    headers: {
      "Content-Type": "application/json",
      "x-goog-api-key": apiKey,
    },
    body: JSON.stringify(payload),
  });
  if (!res.ok) {
    const errText = await res.text();
    throw new Error(`Gemini API Error: ${res.status} - ${errText}`);
  }

  const data = await res.json();
  // Every hop is optional: a candidate may come back without `content.parts`,
  // and that must surface as the explicit error below, not a raw TypeError.
  const text = data?.candidates?.[0]?.content?.parts?.[0]?.text;
  if (!text) {
    throw new Error("Invalid API response - no content generated");
  }
  return text;
}
/**
 * Splits text into overlapping, trimmed windows.
 *
 * A window is kept only if it is at least 100 characters long after trimming
 * and its first 500 normalized characters (lower-cased, whitespace collapsed)
 * have not been seen in an earlier window.
 *
 * @param {string} text - Text to split.
 * @param {object} [options]
 * @param {number} [options.chunkSize=2000] - Maximum window length in characters.
 * @param {number} [options.chunkOverlap=300] - Characters shared by consecutive windows.
 * @returns {string[]} The kept windows in order; if none qualified, a
 *   single-element array holding the first `chunkSize` characters of `text`.
 */
function chunkText(text, { chunkSize = 2000, chunkOverlap = 300 } = {}) {
  console.log(' Chunk loop start:', text.length, 'chars');
  const seen = new Set();
  const chunks = [];
  let start = 0;
  while (start < text.length) {
    const end = Math.min(start + chunkSize, text.length);
    const piece = text.slice(start, end).trim();
    const normalized = piece.toLowerCase().replace(/\s+/g, ' ').substring(0, 500);
    if (piece.length >= 100 && !seen.has(normalized)) {
      seen.add(normalized);
      chunks.push(piece);
    }
    // The window reached the end of the text: stop here. Stepping back by the
    // overlap would only re-scan the tail and emit a chunk that is already
    // fully contained in the one just processed.
    if (end >= text.length) break;
    // Always advance by at least one character so the loop terminates even
    // when chunkOverlap >= chunkSize; no iteration cap is needed, so long
    // documents are never truncated.
    start = Math.max(start + 1, end - chunkOverlap);
  }
  console.log(' Chunk loop end:', chunks.length, 'chunks');
  return chunks.length > 0 ? chunks : [text.slice(0, chunkSize)];
}
// Runtime configuration and the Chroma handles shared by the route handlers.
const PORT = Number(process.env.PORT || 3000);
const chroma = new ChromaClient({ host: "localhost", port: 8000, ssl: false });
const embeddingFunction = new DefaultEmbeddingFunction();
// A fresh, timestamp-suffixed collection name is used for every process start.
const COLLECTION_NAME = `pdf_rag_${Date.now()}`;
// Assigned asynchronously by initCollection(); undefined until that completes.
let collection;

/**
 * Deletes every existing collection whose name starts with "pdf_rag_", then
 * creates the collection for this run and stores it in `collection`.
 *
 * @returns {Promise<void>}
 */
async function initCollection() {
  for (const { name } of await chroma.listCollections()) {
    if (!name.startsWith('pdf_rag_')) continue;
    await chroma.deleteCollection({ name });
    console.log(`Deleted old: ${name}`);
  }
  collection = await chroma.createCollection({ name: COLLECTION_NAME, embeddingFunction });
  console.log(`New collection: ${COLLECTION_NAME}`);
}

// Started without awaiting; any failure is logged and terminates the process.
initCollection().catch((err) => {
  console.error('Chroma init failed:', err.message);
  process.exit(1);
});
// Express application with its global middleware, registered in this order:
// CORS, JSON body parsing, then static files from the "public" directory.
const app = express();
for (const middleware of [cors(), express.json(), express.static("public")]) {
  app.use(middleware);
}
// GET / — sends public/index.html (resolved relative to this module) as HTML;
// if that fails, sends a minimal inline placeholder page instead.
app.get("/", async (_req, res) => {
  try {
    // The content type is set before the file read, as in a single chained call.
    const htmlResponse = res.type("html");
    const indexUrl = new URL("./public/index.html", import.meta.url);
    const page = await readFile(indexUrl);
    htmlResponse.send(page);
  } catch {
    res.send("<h1>PDF Chatbot</h1><p>Upload PDF via UI.</p>");
  }
});
// GET /health — reports the number of stored chunks and the active collection
// name. The count falls back to 0 while `collection` is not yet assigned; a
// failing count() call yields a 500 with the error message.
app.get("/health", async (_req, res) => {
  try {
    const chunks = (await collection?.count()) ?? 0;
    res.json({ ok: true, chunks, collection: COLLECTION_NAME });
  } catch (err) {
    res.status(500).json({ error: err.message });
  }
});
// Multer instance for uploads: files are kept in memory (memoryStorage) and
// limited to 50 MiB each.
const MAX_UPLOAD_BYTES = 50 * 1024 * 1024;
const upload = multer({
  limits: { fileSize: MAX_UPLOAD_BYTES },
  storage: multer.memoryStorage(),
});
// POST /ingest — takes one multipart upload from the "file" field, extracts
// its text with pdf-parse, splits it with chunkText(), and adds the chunks to
// the Chroma collection. Responds 400 for a missing or non-PDF upload, for a
// PDF without extractable text, or when no chunks were produced; any thrown
// error becomes a 500 carrying the error message.
app.post("/ingest", upload.single("file"), async (req, res) => {
  try {
    const uploaded = req.file;
    console.log('=== INGEST START ===', uploaded?.originalname);
    if (uploaded?.mimetype !== "application/pdf") {
      return res.status(400).json({ error: "Invalid PDF file" });
    }

    console.log('Step 1: Parsing PDF...');
    const pdfData = await pdfParse(uploaded.buffer);
    const rawText = pdfData.text || "";
    // Fold line breaks (with surrounding whitespace) and whitespace runs into single spaces.
    const fullText = rawText.replace(/\s*\n\s*/g, ' ').replace(/\s{2,}/g, ' ').trim();
    if (!fullText) {
      return res.status(400).json({ error: "No text extractable (try OCR for images?)" });
    }
    console.log(`Step 1 done: ${fullText.length} chars`);

    console.log('Step 2: Chunking...');
    const chunkingStartedAt = Date.now();
    const chunks = chunkText(fullText);
    console.log(`Step 2 done: ${Date.now() - chunkingStartedAt}ms, ${chunks.length} chunks`);
    if (chunks.length === 0) {
      return res.status(400).json({ error: "Failed to create chunks" });
    }

    console.log('Step 3: Saving to Chroma...');
    const savingStartedAt = Date.now();
    // One random id per chunk; metadata records the source file name and the
    // chunk's index (as a string).
    const ids = Array.from(chunks, () => randomUUID());
    const metadatas = chunks.map((_chunk, index) => ({
      file: uploaded.originalname,
      chunk: String(index),
    }));
    await collection.add({ ids, documents: chunks, metadatas, embeddingFunction });
    console.log(`Step 3 done: ${Date.now() - savingStartedAt}ms`);

    const total = await collection.count();
    const summary = { chunks: chunks.length, total };
    console.log('=== INGEST COMPLETE ===', summary);
    res.json({ ok: true, message: "Ingested successfully", ...summary });
  } catch (err) {
    console.error('=== INGEST ERROR ===', err.message, err.stack);
    res.status(500).json({ error: `Ingestion failed: ${err.message}` });
  }
});
// POST /ask — expects a JSON body { question }. Looks up the closest stored
// chunks in the Chroma collection, builds a context string from the top three,
// and returns the model's answer as { ok: true, answer }.
// Responds 400 when the question is missing, not a string, or blank; any
// thrown error becomes a 500 carrying the error message.
app.post("/ask", async (req, res) => {
  try {
    // Depending on the Express version, req.body can be undefined when no JSON
    // body was parsed; default it so that case yields the 400 below instead of
    // a TypeError (which would surface as a 500).
    const { question } = req.body ?? {};
    console.log('=== QUERY START ===', question);
    if (typeof question !== "string" || question.trim() === "") {
      return res.status(400).json({ error: "Missing question" });
    }

    const count = await collection.count();
    if (count === 0) {
      return res.json({ ok: true, answer: "Please upload a PDF first." });
    }

    console.log('Step 5: Searching...');
    const result = await collection.query({
      queryTexts: [question],
      nResults: 5,
      include: ["documents", "metadatas", "distances"],
      embeddingFunction
    });
    // Results are nested per query text; only one query was sent, so take index 0.
    const docs = result.documents?.[0] ?? [];
    const metas = result.metadatas?.[0] ?? [];
    const dists = result.distances?.[0] ?? [];
    // Five results are requested but only the first three go into the prompt.
    const rows = docs.map((d, i) => ({ doc: d, meta: metas[i], dist: dists[i] })).slice(0, 3);
    if (rows.length === 0) {
      return res.json({ ok: true, answer: "No relevant info found in the PDF." });
    }

    // NOTE(review): "1 - distance" is only a meaningful similarity when the
    // collection's distance metric is bounded to [0, 1] — confirm the metric.
    const context = rows.map(r =>
      `${r.doc.trim()}\n\n(Source: ${r.meta?.file ?? "PDF"} - Chunk ${r.meta?.chunk ?? "?"} | Similarity: ${(1 - (r.dist ?? 0)).toFixed(2)})`
    ).join("\n\n---\n\n");
    console.log(`Steps 5-6 done: ${rows.length} chunks in context`);

    console.log('Step 7: Generating answer...');
    const answer = await generateAnswer(question, context); // Manual API call
    console.log('Step 7 done');
    res.json({
      ok: true,
      answer
    });
  } catch (err) {
    console.error('=== QUERY ERROR ===', err.message);
    res.status(500).json({ error: `Query failed: ${err.message}` });
  }
});
// Start the HTTP server and, once it is listening, print the server URL plus
// reminders about the Chroma container and the Gemini model in use.
app.listen(PORT, () => {
  const startupLines = [
    `Server: http://localhost:${PORT}`,
    "Chroma: docker run -d -p 8000:8000 --name chroma -v chroma_data:/chroma/chroma chromadb/chroma",
    "Gemini: Using manual API with gemini-2.5-flash (v1beta)",
  ];
  for (const line of startupLines) {
    console.log(line);
  }
});