diff --git a/README.md b/README.md
index a0a4f18d..493a9885 100644
--- a/README.md
+++ b/README.md
@@ -124,6 +124,7 @@ Create `.env` from `.env.example` and fill required values:
 - `DATABASE_URL`
 - `NEXT_PUBLIC_CLERK_PUBLISHABLE_KEY`
 - `CLERK_SECRET_KEY`
+- `BLOB_READ_WRITE_TOKEN` (Vercel Blob read/write token)
 - `OPENAI_API_KEY`
 - `INNGEST_EVENT_KEY`, as placeholder
@@ -137,6 +138,18 @@ Optional integrations:
 - `LANGCHAIN_TRACING_V2`, `LANGCHAIN_API_KEY`, `LANGCHAIN_PROJECT`
 - `DEBUG_PERF` (`1` or `true`) to enable dev perf logs for middleware and key auth/dashboard APIs
 
+### 2.1) Configure Vercel Blob Storage
+
+Vercel Blob is used for storing uploaded documents. Both **public** and **private** stores are supported; the upload logic detects which mode the store uses and adapts automatically.
+
+1. In the Vercel dashboard, go to **Storage → Blob → Create Store**.
+2. Choose either **Public** or **Private** access. Both work:
+   - **Public** stores produce URLs the browser can load directly (faster for previews).
+   - **Private** stores keep files behind authentication; the app proxies content through `/api/documents/[id]/content` and `/api/files/[id]` so previews still work.
+3. Generate a **Read/Write token** for the store and add it as `BLOB_READ_WRITE_TOKEN` in your environment (`.env` locally, or Vercel Project Settings for deploys).
+4. Redeploy so the token is available at build and runtime.
+5. Verify: sign in, open the Employer Upload page, upload a small PDF, and confirm `/api/upload-local` returns a `vercel-storage.com` URL without errors.
+
 ### 3) Start database and apply schema
 
 ```bash
diff --git a/docs/deployment.md b/docs/deployment.md
index dda49d9a..4ef23d95 100644
--- a/docs/deployment.md
+++ b/docs/deployment.md
@@ -51,7 +51,7 @@ docker compose --env-file .env --profile dev up
 1. Import repository into Vercel.
 2. Configure managed PostgreSQL (Vercel Postgres, Neon, Supabase, etc.).
-3. Set `DATABASE_URL` and app environment variables.
+3. Set `DATABASE_URL`, `BLOB_READ_WRITE_TOKEN`, and the other app environment variables.
 4. Deploy with Vercel defaults.
 5. Apply schema once:
@@ -65,6 +65,12 @@ Optional integrations:
 - LangSmith for tracing
 - Sidecar (deploy separately and set `SIDECAR_URL`)
 
+### Verifying Blob uploads on Vercel
+
+1. After deploy, sign in to the Employer portal and open `/employer/upload`.
+2. Upload any small PDF or DOCX. The `/api/upload-local` response should return a `vercel-storage.com` URL.
+3. Open that URL in a new tab. The file should download directly, confirming Blob access end to end.
+
 ## Option 3: VPS self-hosted (Node + reverse proxy)
 
 1. Install Node.js 18+, pnpm, Nginx, and PostgreSQL with pgvector.
@@ -89,7 +95,8 @@ Optional: Run the sidecar separately and point `SIDECAR_URL` to it.
| `CLERK_SECRET_KEY` | Yes | Clerk secret key | | `OPENAI_API_KEY` | Yes | OpenAI API key | | `INNGEST_EVENT_KEY` | Yes (prod) | Inngest event key for background jobs | -| `UPLOADTHING_TOKEN` | Optional | UploadThing for cloud storage | +| `BLOB_READ_WRITE_TOKEN` | Yes (Vercel) | Required for Vercel Blob uploads | +| `UPLOADTHING_TOKEN` | Optional | UploadThing legacy uploader | | `SIDECAR_URL` | Optional | Sidecar URL for reranking and Graph RAG | | `TAVILY_API_KEY` | Optional | Web search for analysis | | `AZURE_DOC_INTELLIGENCE_*` | Optional | OCR for scanned PDFs | diff --git a/drizzle/0002_vercel_blob.sql b/drizzle/0002_vercel_blob.sql new file mode 100644 index 00000000..4eac39f3 --- /dev/null +++ b/drizzle/0002_vercel_blob.sql @@ -0,0 +1,8 @@ +ALTER TABLE "file_uploads" + ADD COLUMN IF NOT EXISTS "storage_provider" varchar(64) NOT NULL DEFAULT 'database', + ADD COLUMN IF NOT EXISTS "storage_url" varchar(1024), + ADD COLUMN IF NOT EXISTS "storage_pathname" varchar(1024), + ADD COLUMN IF NOT EXISTS "blob_checksum" varchar(128); + +ALTER TABLE "file_uploads" + ALTER COLUMN "file_data" DROP NOT NULL; diff --git a/next.config.ts b/next.config.ts index 4e021d8e..2a4ca279 100644 --- a/next.config.ts +++ b/next.config.ts @@ -82,6 +82,9 @@ const config: NextConfig = { "@img/sharp-libvips-linuxmusl-x64", "@img/sharp-libvips-linux-x64", "pdf-lib", + "jszip", + "readable-stream", + "mammoth", ], }; diff --git a/package.json b/package.json index 567dfd7c..d2e85691 100644 --- a/package.json +++ b/package.json @@ -72,6 +72,7 @@ "@tiptap/starter-kit": "^3.20.0", "@uploadthing/react": "^7.3.3", "@vercel/analytics": "^1.6.1", + "@vercel/blob": "^2.3.0", "cheerio": "^1.2.0", "class-variance-authority": "^0.7.1", "clsx": "*", diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index ca0e2f55..ad8624ef 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -158,6 +158,9 @@ importers: '@vercel/analytics': specifier: ^1.6.1 version: 1.6.1(next@15.5.7(@babel/core@7.28.5)(@opentelemetry/api@1.9.0)(@playwright/test@1.55.0)(react-dom@18.3.1(react@18.3.1))(react@18.3.1))(react@18.3.1) + '@vercel/blob': + specifier: ^2.3.0 + version: 2.3.0 cheerio: specifier: ^1.2.0 version: 1.2.0 @@ -4418,6 +4421,10 @@ packages: vue-router: optional: true + '@vercel/blob@2.3.0': + resolution: {integrity: sha512-oYWiJbWRQ7gz9Mj0X/NHFJ3OcLMOBzq/2b3j6zeNrQmtFo6dHwU8FAwNpxVIYddVMd+g8eqEi7iRueYx8FtM0Q==} + engines: {node: '>=20.0.0'} + '@xmldom/xmldom@0.8.11': resolution: {integrity: sha512-cQzWCtO6C8TQiYl1ruKNn2U6Ao4o4WBBcbL61yJl84x+j5sOWWFU9X7DpND8XZG3daDppSsigMdfAIl2upQBRw==} engines: {node: '>=10.0.0'} @@ -4559,6 +4566,9 @@ packages: resolution: {integrity: sha512-hsU18Ae8CDTR6Kgu9DYf0EbCr/a5iGL0rytQDobUcdpYOKokk8LEjVphnXkDkgpi0wYVsqrXuP0bZxJaTqdgoA==} engines: {node: '>= 0.4'} + async-retry@1.3.3: + resolution: {integrity: sha512-wfr/jstw9xNi/0teMHrRW7dsz3Lt5ARhYNZ2ewpadnhaIp5mbALhOAP+EAdsC7t4Z6wqsDVv9+W6gm1Dk9mEyw==} + asynckit@0.4.0: resolution: {integrity: sha512-Oei9OH4tRh0YqU3GxhX79dM/mwVgvbZJaSNaRk+bshkj0S5cfHcgYakreBjrHwatXKbz+IoIdYLxrKim2MjW0Q==} @@ -6037,6 +6047,10 @@ packages: resolution: {integrity: sha512-wa56o2/ElJMYqjCjGkXri7it5FbebW5usLw/nPmCMs5DeZ7eziSYZhSmPRn0txqeW4LnAmQQU7FgqLpsEFKM4A==} engines: {node: '>= 0.4'} + is-buffer@2.0.5: + resolution: {integrity: sha512-i2R6zNFDwgEHJyQUtJEk0XFi1i0dPFn/oqjK3/vPCcDeJvW5NQ83V8QbicfF1SupOaB0h8ntgBC2YiE7dfyctQ==} + engines: {node: '>=4'} + is-bun-module@2.0.0: resolution: {integrity: sha512-gNCGbnnnnFAUGKeZ9PdbyeGYJqewpmc2aKHUEMO5nQPWU9lOmv7jcmQIv+qHD8fXW6W7qfuCwX4rY9LNRjXrkQ==} 
@@ -6094,6 +6108,9 @@ packages: resolution: {integrity: sha512-5KoIu2Ngpyek75jXodFvnafB6DJgr3u8uuK0LEZJjrU19DrMD3EVERaR8sjz8CCGgpZvxPl9SuE1GMVPFHx1mw==} engines: {node: '>= 0.4'} + is-node-process@1.2.0: + resolution: {integrity: sha512-Vg4o6/fqPxIjtxgUH5QLJhwZ7gW5diGCVlXpuUfELC62CuxM1iHcRe51f2W1FDy04Ai4KJkagKjx3XaqyfRKXw==} + is-number-object@1.1.1: resolution: {integrity: sha512-lZhclumE1G6VYD8VHe35wFaIif+CTy5SJIi5+3y4psDgWu4wPDoBhF8NxUOinEc7pHgiTsT6MaBb92rKhhD+Xw==} engines: {node: '>= 0.4'} @@ -8106,6 +8123,10 @@ packages: thenify@3.3.1: resolution: {integrity: sha512-RVZSIV5IG10Hk3enotrhvz0T9em6cyHBLkH/YAZuKqd8hRkKhSfCGIcP2KUY0EPxndzANBmNllzWPwak+bheSw==} + throttleit@2.1.0: + resolution: {integrity: sha512-nt6AMGKW1p/70DF/hGBdJB57B8Tspmbp5gfJ8ilhLnt7kkr2ye7hzD6NVG8GGErk2HWF34igrL2CXmNIkzKqKw==} + engines: {node: '>=18'} + tiny-invariant@1.3.3: resolution: {integrity: sha512-+FbBPE1o9QAYvviau/qC5SE3caw21q3xkvWKBtja5vgqOWIHHJ3ioaq1VPfn/Szqctz2bU/oYeKd9/z5BL+PVg==} @@ -8253,6 +8274,10 @@ packages: undici-types@7.10.0: resolution: {integrity: sha512-t5Fy/nfn+14LuOc2KNYg75vZqClpAiqscVvMygNnlsHBFpSXdJaYtXMcdNLpl/Qvc3P2cB3s6lOV51nqsFq4ag==} + undici@6.23.0: + resolution: {integrity: sha512-VfQPToRA5FZs/qJxLIinmU59u0r7LXqoJkCzinq3ckNJp3vKEh7jTWN589YQ5+aoAC/TGRLyJLCPKcLQbM8r9g==} + engines: {node: '>=18.17'} + undici@7.22.0: resolution: {integrity: sha512-RqslV2Us5BrllB+JeiZnK4peryVTndy9Dnqq62S3yYRRTj0tFQCwEniUy2167skdGOy3vqRzEvl1Dm4sV2ReDg==} engines: {node: '>=20.18.1'} @@ -12624,6 +12649,14 @@ snapshots: next: 15.5.7(@babel/core@7.28.5)(@opentelemetry/api@1.9.0)(@playwright/test@1.55.0)(react-dom@18.3.1(react@18.3.1))(react@18.3.1) react: 18.3.1 + '@vercel/blob@2.3.0': + dependencies: + async-retry: 1.3.3 + is-buffer: 2.0.5 + is-node-process: 1.2.0 + throttleit: 2.1.0 + undici: 6.23.0 + '@xmldom/xmldom@0.8.11': {} abort-controller-x@0.4.3: {} @@ -12775,6 +12808,10 @@ snapshots: async-function@1.0.0: {} + async-retry@1.3.3: + dependencies: + retry: 0.13.1 + asynckit@0.4.0: {} autoprefixer@10.4.21(postcss@8.5.6): @@ -14461,6 +14498,8 @@ snapshots: call-bound: 1.0.4 has-tostringtag: 1.0.2 + is-buffer@2.0.5: {} + is-bun-module@2.0.0: dependencies: semver: 7.7.2 @@ -14511,6 +14550,8 @@ snapshots: is-negative-zero@2.0.3: {} + is-node-process@1.2.0: {} + is-number-object@1.1.1: dependencies: call-bound: 1.0.4 @@ -17106,6 +17147,8 @@ snapshots: dependencies: any-promise: 1.3.0 + throttleit@2.1.0: {} + tiny-invariant@1.3.3: {} tinyglobby@0.2.14: @@ -17252,6 +17295,8 @@ snapshots: undici-types@7.10.0: {} + undici@6.23.0: {} + undici@7.22.0: {} unicode-canonical-property-names-ecmascript@2.0.1: {} diff --git a/scripts/ensure-pgvector.mjs b/scripts/ensure-pgvector.mjs index 2e29f08c..d77d7b23 100644 --- a/scripts/ensure-pgvector.mjs +++ b/scripts/ensure-pgvector.mjs @@ -1,4 +1,5 @@ -import 'dotenv/config'; +import dotenv from "dotenv"; +dotenv.config(); import postgres from "postgres"; const url = process.env.DATABASE_URL; diff --git a/scripts/test-trend-search.ts b/scripts/test-trend-search.ts index d63c02c1..791cd048 100644 --- a/scripts/test-trend-search.ts +++ b/scripts/test-trend-search.ts @@ -75,7 +75,8 @@ Running pipeline (plan → search → synthesize)… } */ -import "dotenv/config"; +import dotenv from "dotenv"; +dotenv.config(); // Skip the full env validation so we don't need DB/Clerk/Inngest keys process.env.SKIP_ENV_VALIDATION = "true"; diff --git a/src/app/api/documents/[id]/content/route.ts b/src/app/api/documents/[id]/content/route.ts new file mode 100644 index 00000000..506dcdbc --- 
/dev/null
+++ b/src/app/api/documents/[id]/content/route.ts
@@ -0,0 +1,91 @@
+import { NextResponse } from "next/server";
+import { eq } from "drizzle-orm";
+import { auth } from "@clerk/nextjs/server";
+import { db } from "~/server/db";
+import { document } from "~/server/db/schema";
+import { isPrivateBlobUrl, fetchBlob } from "~/server/storage/vercel-blob";
+
+const EXTENSION_TO_MIME: Record<string, string> = {
+  ".pdf": "application/pdf",
+  ".png": "image/png",
+  ".jpg": "image/jpeg",
+  ".jpeg": "image/jpeg",
+  ".gif": "image/gif",
+  ".webp": "image/webp",
+  ".tiff": "image/tiff",
+  ".tif": "image/tiff",
+  ".bmp": "image/bmp",
+  ".docx": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+  ".xlsx": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
+  ".pptx": "application/vnd.openxmlformats-officedocument.presentationml.presentation",
+  ".txt": "text/plain",
+  ".csv": "text/csv",
+  ".html": "text/html",
+  ".md": "text/markdown",
+};
+
+function inferMime(name: string): string {
+  const match = /(\.[a-z0-9]+)(?:\?|#|$)/i.exec(name);
+  return (match?.[1] && EXTENSION_TO_MIME[match[1].toLowerCase()]) ?? "application/octet-stream";
+}
+
+interface RouteParams {
+  params: Promise<{ id: string }>;
+}
+
+export async function GET(_request: Request, { params }: RouteParams) {
+  try {
+    const { userId } = await auth();
+    if (!userId) {
+      return NextResponse.json({ error: "Unauthorized" }, { status: 401 });
+    }
+
+    const { id } = await params;
+    const docId = parseInt(id, 10);
+    if (isNaN(docId)) {
+      return NextResponse.json({ error: "Invalid document ID" }, { status: 400 });
+    }
+
+    const [doc] = await db
+      .select({ url: document.url, title: document.title })
+      .from(document)
+      .where(eq(document.id, docId));
+
+    if (!doc) {
+      return NextResponse.json({ error: "Document not found" }, { status: 404 });
+    }
+
+    if (!isPrivateBlobUrl(doc.url)) {
+      return NextResponse.redirect(doc.url, { status: 307 });
+    }
+
+    const blobRes = await fetchBlob(doc.url);
+    if (!blobRes.ok) {
+      return NextResponse.json(
+        { error: "Failed to retrieve document from storage" },
+        { status: 502 },
+      );
+    }
+
+    const mimeType =
+      blobRes.headers.get("content-type") ?? inferMime(doc.title);
+
+    return new NextResponse(blobRes.body, {
+      status: 200,
+      headers: {
+        "Content-Type": mimeType,
+        ...(blobRes.headers.get("content-length")
+          ? { "Content-Length": blobRes.headers.get("content-length")! }
+          : {}),
+        "Content-Disposition": `inline; filename="${encodeURIComponent(doc.title)}"; filename*=UTF-8''${encodeURIComponent(doc.title)}`,
+        "Cache-Control": "private, max-age=3600",
+      },
+    });
+  } catch (error) {
+    console.error("Error serving document content:", error);
+    return NextResponse.json(
+      { error: "Failed to serve document", details: error instanceof Error ?
+          error.message : "Unknown error" },
+      { status: 500 },
+    );
+  }
+}
diff --git a/src/app/api/fetchDocument/route.ts b/src/app/api/fetchDocument/route.ts
index 84d0ee70..9330bfb8 100644
--- a/src/app/api/fetchDocument/route.ts
+++ b/src/app/api/fetchDocument/route.ts
@@ -4,6 +4,7 @@ import { document, users, fileUploads } from "../../../server/db/schema/base";
 import { eq, inArray } from "drizzle-orm";
 import { validateRequestBody, UserIdSchema } from "~/lib/validation";
 import { auth } from '@clerk/nextjs/server';
+import { isPrivateBlobUrl } from "~/server/storage/vercel-blob";
 
 /** Extract file id from /api/files/{id} URL so we can look up mimeType from file_uploads */
 const FILE_API_ID_REGEX = /\/api\/files\/(\d+)/;
@@ -104,8 +105,14 @@ export async function POST(request: Request) {
       const mimeType =
         mimeFromFile ?? inferMimeFromName(doc.title) ?? inferMimeFromName(doc.url);
+
+      const url = isPrivateBlobUrl(doc.url)
+        ? `/api/documents/${Number(doc.id)}/content`
+        : doc.url;
+
       return {
         ...doc,
+        url,
         id: Number(doc.id),
         companyId: Number(doc.companyId),
         ...(mimeType && { mimeType }),
diff --git a/src/app/api/files/[id]/route.ts b/src/app/api/files/[id]/route.ts
index a6f6cb3f..a8c98684 100644
--- a/src/app/api/files/[id]/route.ts
+++ b/src/app/api/files/[id]/route.ts
@@ -7,6 +7,7 @@ import { NextResponse } from "next/server";
 import { eq } from "drizzle-orm";
 import { db } from "~/server/db";
 import { fileUploads } from "~/server/db/schema";
+import { isPrivateBlobUrl, fetchBlob } from "~/server/storage/vercel-blob";
 
 const MIME_BY_EXTENSION: Record<string, string> = {
   pdf: "application/pdf",
@@ -71,6 +72,43 @@ export async function GET(
     );
   }
 
+  if (file.storageProvider === "vercel_blob" && file.storageUrl) {
+    if (isPrivateBlobUrl(file.storageUrl)) {
+      const blobRes = await fetchBlob(file.storageUrl);
+      if (!blobRes.ok) {
+        return NextResponse.json(
+          { error: "Failed to retrieve file from storage" },
+          { status: 502 }
+        );
+      }
+      const mimeType =
+        blobRes.headers.get("content-type") ??
+        file.mimeType?.trim() ??
+        inferMimeTypeFromFilename(file.filename);
+      return new NextResponse(blobRes.body, {
+        status: 200,
+        headers: {
+          "Content-Type": mimeType,
+          ...(blobRes.headers.get("content-length")
+            ? { "Content-Length": blobRes.headers.get("content-length")!
} + : {}), + "Content-Disposition": `inline; filename="${encodeURIComponent(file.filename)}"; filename*=UTF-8''${encodeURIComponent(file.filename)}`, + "Cache-Control": "private, max-age=31536000", + }, + }); + } + return NextResponse.redirect(file.storageUrl, { + status: 307, + }); + } + + if (!file.fileData) { + return NextResponse.json( + { error: "File is not available in database storage" }, + { status: 404 } + ); + } + // Decode base64 data back to binary const binaryData = Buffer.from(file.fileData, "base64"); const mimeType = file.mimeType?.trim() || inferMimeTypeFromFilename(file.filename); diff --git a/src/app/api/upload-local/route.ts b/src/app/api/upload-local/route.ts index 3ff066e2..37fcf088 100644 --- a/src/app/api/upload-local/route.ts +++ b/src/app/api/upload-local/route.ts @@ -7,6 +7,7 @@ import { NextResponse } from "next/server"; import { auth } from "@clerk/nextjs/server"; import { db } from "~/server/db"; import { fileUploads } from "~/server/db/schema"; +import { putFile } from "~/server/storage/vercel-blob"; import { isUploadAccepted } from "~/lib/upload-accepted"; const MAX_FILE_SIZE = 16 * 1024 * 1024; // 16MB to match UploadThing config @@ -39,10 +40,6 @@ export async function POST(request: Request) { ); } - console.log( - `[UploadLocal] Received file: name=${file.name}, mime=${file.type}, size=${(file.size / 1024).toFixed(1)}KB, user=${userId}` - ); - if (!isUploadAccepted({ name: file.name, type: file.type })) { console.warn(`[UploadLocal] Rejected: unsupported file type name=${file.name}, mime=${file.type}`); return NextResponse.json( @@ -51,33 +48,46 @@ export async function POST(request: Request) { ); } + console.log( + `[UploadLocal] Uploading to Vercel Blob: name=${file.name}, mime=${file.type}, size=${(file.size / 1024).toFixed(1)}KB, user=${userId}` + ); + if (file.size > MAX_FILE_SIZE) { - console.warn(`[UploadLocal] Rejected: file too large size=${(file.size / 1024 / 1024).toFixed(1)}MB, max=${MAX_FILE_SIZE / 1024 / 1024}MB`); + console.warn( + `[UploadLocal] Rejected: file too large size=${(file.size / 1024 / 1024).toFixed(1)}MB, max=${MAX_FILE_SIZE / 1024 / 1024}MB` + ); return NextResponse.json( { error: `File too large. Maximum size is ${MAX_FILE_SIZE / 1024 / 1024}MB.` }, { status: 400 } ); } - // Convert file to base64 - console.log(`[UploadLocal] Converting to base64...`); - const arrayBuffer = await file.arrayBuffer(); - const base64Data = Buffer.from(arrayBuffer).toString("base64"); - console.log(`[UploadLocal] Base64 encoded: ${(base64Data.length / 1024).toFixed(1)}KB`); - - // Store in database - console.log(`[UploadLocal] Storing in database...`); - const [uploadedFile] = await db.insert(fileUploads).values({ - userId, + const blob = await putFile({ filename: file.name, - mimeType: file.type, - fileData: base64Data, - fileSize: file.size, - }).returning({ - id: fileUploads.id, - filename: fileUploads.filename, + data: await file.arrayBuffer(), + contentType: file.type || undefined, }); + const [uploadedFile] = await db + .insert(fileUploads) + .values({ + userId, + filename: file.name, + mimeType: file.type, + fileData: null, + fileSize: file.size, + storageProvider: "vercel_blob", + storageUrl: blob.url, + storagePathname: blob.pathname, + blobChecksum: blob.checksum ?? 
null, + }) + .returning({ + id: fileUploads.id, + filename: fileUploads.filename, + storageProvider: fileUploads.storageProvider, + storageUrl: fileUploads.storageUrl, + }); + if (!uploadedFile) { console.error("[UploadLocal] Database insert returned no result"); return NextResponse.json( @@ -87,7 +97,7 @@ export async function POST(request: Request) { } // Return URL that can be used to fetch the file - const fileUrl = `/api/files/${uploadedFile.id}`; + const fileUrl = blob.url; const elapsed = Date.now() - uploadStart; console.log( @@ -99,6 +109,8 @@ export async function POST(request: Request) { url: fileUrl, name: uploadedFile.filename, id: uploadedFile.id, + provider: uploadedFile.storageProvider, + pathname: blob.pathname, }); } catch (error) { const elapsed = Date.now() - uploadStart; diff --git a/src/app/employer/documents/components/DocxViewer.tsx b/src/app/employer/documents/components/DocxViewer.tsx index 941fc063..b4efee0c 100644 --- a/src/app/employer/documents/components/DocxViewer.tsx +++ b/src/app/employer/documents/components/DocxViewer.tsx @@ -1,133 +1,35 @@ "use client"; -import React, { useEffect, useState, useRef } from "react"; -import { Loader2, AlertTriangle, RotateCw } from "lucide-react"; +import React from "react"; +import { FileText, Download } from "lucide-react"; interface DocxViewerProps { url: string; title: string; } -/** - * Client-side DOCX viewer that converts .docx files to HTML using mammoth.js. - * Fetches the binary, converts to HTML, and renders in a sandboxed container. - */ -export function DocxViewer({ url, title: _title }: DocxViewerProps) { - const [html, setHtml] = useState(""); - const [loading, setLoading] = useState(true); - const [error, setError] = useState(null); - const containerRef = useRef(null); - - const loadDocument = async () => { - setLoading(true); - setError(null); - setHtml(""); - - try { - // Fetch the raw binary - const response = await fetch(url); - if (!response.ok) throw new Error(`Failed to fetch document (${response.status})`); - - const arrayBuffer = await response.arrayBuffer(); - - // Dynamically import mammoth for client-side use - const mammoth = await import("mammoth"); - const result = await mammoth.convertToHtml({ arrayBuffer }); - - if (result.messages.length > 0) { - console.warn("[DocxViewer] Mammoth warnings:", result.messages); - } - - setHtml(result.value); - } catch (err) { - console.error("[DocxViewer] Error converting DOCX:", err); - setError(err instanceof Error ? err.message : "Failed to render document"); - } finally { - setLoading(false); - } - }; - - useEffect(() => { - void loadDocument(); - // eslint-disable-next-line react-hooks/exhaustive-deps - }, [url]); - - if (loading) { - return ( -
-        <Loader2 className="h-5 w-5 animate-spin" />
-        <span className="text-sm">Converting document...</span>
-      </div>
-    );
-  }
+export function DocxViewer({ url, title }: DocxViewerProps) {
+  return (
+    <div className="flex h-full flex-col items-center justify-center gap-3 p-8 text-center">
+      <FileText className="h-10 w-10 text-muted-foreground" />
-
-  if (error) {
-    return (
-      <div className="flex h-full flex-col items-center justify-center gap-3 p-6 text-center">
-        <AlertTriangle className="h-8 w-8 text-destructive" />
-        <div>
-          <p className="text-sm font-medium">Failed to render document</p>
-          <p className="text-xs text-muted-foreground">{error}</p>
-        </div>
-        <button
-          type="button"
-          onClick={() => void loadDocument()}
-          className="inline-flex items-center gap-2 text-sm underline"
-        >
-          <RotateCw className="h-4 w-4" />
-          Retry
-        </button>
+      <div>
+        <p className="text-sm font-medium">DOCX preview unavailable</p>
+        <p className="text-xs text-muted-foreground">
+          Download {title} to view the full document.
+        </p>
+      </div>
+      <a
+        href={url}
+        download
+        className="inline-flex items-center gap-2 text-sm underline"
+      >
+        <Download className="h-4 w-4" />
+        Download
+      </a>
+    </div>
+  );
+}
-      </div>
-    );
-  }
-
-  return (
-    <div className="h-full overflow-auto bg-white p-8">
-      <div ref={containerRef} dangerouslySetInnerHTML={{ __html: html }} />
- {/* Scoped styles for the converted HTML */} - - ); } diff --git a/src/app/employer/upload/UploadForm.tsx b/src/app/employer/upload/UploadForm.tsx index 1edeabdc..4c0e5030 100644 --- a/src/app/employer/upload/UploadForm.tsx +++ b/src/app/employer/upload/UploadForm.tsx @@ -56,29 +56,6 @@ const ZIP_MIME_TYPES = new Set([ "multipart/x-zip", "application/octet-stream", ]); -const MIME_BY_EXTENSION: Record = { - pdf: "application/pdf", - png: "image/png", - jpg: "image/jpeg", - jpeg: "image/jpeg", - tif: "image/tiff", - tiff: "image/tiff", - webp: "image/webp", - gif: "image/gif", - bmp: "image/bmp", - docx: "application/vnd.openxmlformats-officedocument.wordprocessingml.document", - doc: "application/msword", - xlsx: "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", - xls: "application/vnd.ms-excel", - csv: "text/csv", - pptx: "application/vnd.openxmlformats-officedocument.presentationml.presentation", - ppt: "application/vnd.ms-powerpoint", - txt: "text/plain", - md: "text/markdown", - markdown: "text/markdown", - html: "text/html", - htm: "text/html", -}; interface DocumentFile { id: string; @@ -100,12 +77,6 @@ interface BatchSettings { storageMethod: string; } -interface ZipExtractionResult { - files: File[]; - nestedZipCount: number; - metadataFileCount: number; -} - type DataTransferItemWithWebkitEntry = DataTransferItem & { webkitGetAsEntry?: () => FileSystemEntry | null; }; @@ -196,111 +167,39 @@ const UploadForm: React.FC = ({ return extension === "zip" || ZIP_MIME_TYPES.has(mimeType); }, []); - const inferMimeTypeFromFilename = useCallback((filename: string) => { - const extension = filename.toLowerCase().split(".").pop() ?? ""; - return MIME_BY_EXTENSION[extension]; - }, []); - - const extractFilesFromZip = useCallback(async (zipFile: File): Promise => { - const JSZip = (await import("jszip")).default; - const zip = await JSZip.loadAsync(await zipFile.arrayBuffer()); - - const extractedFiles: File[] = []; - let nestedZipCount = 0; - let metadataFileCount = 0; - - for (const [entryPath, entry] of Object.entries(zip.files)) { - if (entry.dir) continue; - - // Skip macOS ZIP metadata/resource fork entries. - const normalizedEntryPath = entryPath.replaceAll("\\", "/"); - const pathParts = normalizedEntryPath.split("/").filter(Boolean); - if (pathParts.length === 0) continue; - const safeName = pathParts[pathParts.length - 1]!; - if ( - pathParts.includes("__MACOSX") || - safeName.startsWith("._") || - safeName === ".DS_Store" - ) { - metadataFileCount++; - continue; - } - - if (safeName.toLowerCase().endsWith(".zip")) { - nestedZipCount++; - continue; - } - - const blob = await entry.async("blob"); - const inferredMime = inferMimeTypeFromFilename(safeName); - extractedFiles.push( - new File([blob], safeName, { - type: inferredMime ?? blob.type ?? 
"", - lastModified: zipFile.lastModified, - }), - ); - } - - return { files: extractedFiles, nestedZipCount, metadataFileCount }; - }, [inferMimeTypeFromFilename]); - const validateAndAddFiles = useCallback( async (files: File[]) => { const validFiles: DocumentFile[] = []; - const filesToValidate: { file: File; fromZip: boolean }[] = []; - let nonZipErrorCount = 0; let zipArchiveCount = 0; - let zipExtractionFailureCount = 0; - let zipNestedSkippedCount = 0; - let zipMetadataSkippedCount = 0; - let zipUnsupportedCount = 0; let zipOversizedCount = 0; let zipQueuedCount = 0; for (const file of files) { - if (!isZipFile(file)) { - filesToValidate.push({ file, fromZip: false }); + if (isZipFile(file)) { + zipArchiveCount++; + if (file.size > MAX_FILE_SIZE) { + zipOversizedCount++; + continue; + } + validFiles.push(defaultDoc(file)); + zipQueuedCount++; continue; } - zipArchiveCount++; - try { - const extracted = await extractFilesFromZip(file); - zipNestedSkippedCount += extracted.nestedZipCount; - zipMetadataSkippedCount += extracted.metadataFileCount; - extracted.files.forEach((extractedFile) => { - filesToValidate.push({ file: extractedFile, fromZip: true }); - }); - } catch (error) { - zipExtractionFailureCount++; - console.error("Failed to extract ZIP file", error); - toast.error(`Failed to extract ${file.name}`); - } - } - - filesToValidate.forEach(({ file, fromZip }) => { if (!isUploadAccepted({ name: file.name, type: file.type })) { - if (fromZip) { - zipUnsupportedCount++; - } else { - nonZipErrorCount++; - } - return; + nonZipErrorCount++; + continue; } + if (file.size > MAX_FILE_SIZE) { - if (fromZip) { - zipOversizedCount++; - } else { - toast.error(`${file.name} exceeds 16MB limit`); - nonZipErrorCount++; - } - return; + toast.error(`${file.name} exceeds 16MB limit`); + nonZipErrorCount++; + continue; } validFiles.push(defaultDoc(file)); - if (fromZip) zipQueuedCount++; - }); + } if (nonZipErrorCount > 0) { toast.error(`${nonZipErrorCount} file(s) were rejected`, { @@ -309,18 +208,14 @@ const UploadForm: React.FC = ({ } if (zipArchiveCount > 0) { - const details: string[] = [`${zipQueuedCount} file(s) added from ZIP`]; - if (zipUnsupportedCount > 0) details.push(`${zipUnsupportedCount} unsupported`); - if (zipOversizedCount > 0) details.push(`${zipOversizedCount} over 16MB`); - if (zipNestedSkippedCount > 0) details.push(`${zipNestedSkippedCount} nested ZIP skipped`); - if (zipMetadataSkippedCount > 0) details.push(`${zipMetadataSkippedCount} system file(s) ignored`); - if (zipExtractionFailureCount > 0) details.push(`${zipExtractionFailureCount} ZIP failed`); - if (zipQueuedCount > 0) { - toast.success("ZIP extraction complete", { description: details.join(" • ") }); - } else { - toast.error("No extractable files were found in ZIP", { - description: details.join(" • "), + toast.success("ZIP archive queued", { + description: "Archive uploads as-is; expansion occurs server-side after upload.", + }); + } + if (zipOversizedCount > 0) { + toast.error("ZIP archive rejected", { + description: `${zipOversizedCount} ZIP file(s) exceeded the 16MB limit.`, }); } } @@ -335,7 +230,7 @@ const UploadForm: React.FC = ({ }); } }, - [defaultDoc, extractFilesFromZip, isZipFile], + [defaultDoc, isZipFile], ); const handleFileSelect = useCallback( @@ -565,12 +460,12 @@ const UploadForm: React.FC = ({ const uploadSingleDocument = async (doc: DocumentFile) => { updateDocument(doc.id, { status: "uploading", progress: 10 }); - const storageType = - doc.storageMethod === "cloud" && isUploadThingConfigured 
? "cloud" : "database"; + const useUploadThingForDoc = doc.storageMethod === "cloud" && isUploadThingConfigured; + let resolvedStorageType: "cloud" | "database" = "cloud"; let fileUrl: string; const mimeType: string | undefined = doc.file.type || undefined; - if (storageType === "cloud") { + if (useUploadThingForDoc) { updateDocument(doc.id, { progress: 30 }); const res = await uploadFiles("documentUploaderRestricted", { files: [doc.file], @@ -586,8 +481,11 @@ const UploadForm: React.FC = ({ const err = (await res.json()) as { error?: string }; throw new Error(err.error ?? "Local upload failed"); } - const data = (await res.json()) as { url: string }; + const data = (await res.json()) as { url: string; provider?: string }; fileUrl = data.url; + if (data.provider !== "vercel_blob") { + resolvedStorageType = "database"; + } } updateDocument(doc.id, { progress: 60 }); @@ -605,7 +503,7 @@ const UploadForm: React.FC = ({ documentName: doc.title, category: doc.category, documentUrl: fileUrl, - storageType, + storageType: resolvedStorageType, mimeType, preferredProvider: preferredProvider === "LANDING_AI" ? "LANDING_AI" : preferredProvider, diff --git a/src/env.ts b/src/env.ts index 4d64ce2e..c2db2681 100644 --- a/src/env.ts +++ b/src/env.ts @@ -13,6 +13,7 @@ const serverSchema = z.object({ DATABASE_URL: z.preprocess(normalize, z.string().url()), OPENAI_API_KEY: requiredString(), CLERK_SECRET_KEY: requiredString(), + BLOB_READ_WRITE_TOKEN: optionalString(), UPLOADTHING_TOKEN: optionalString(), DATALAB_API_KEY: optionalString(), // Web search providers @@ -83,6 +84,7 @@ function parseServerEnv() { DATABASE_URL: process.env.DATABASE_URL, OPENAI_API_KEY: process.env.OPENAI_API_KEY, CLERK_SECRET_KEY: process.env.CLERK_SECRET_KEY, + BLOB_READ_WRITE_TOKEN: process.env.BLOB_READ_WRITE_TOKEN, UPLOADTHING_TOKEN: process.env.UPLOADTHING_TOKEN, DATALAB_API_KEY: process.env.DATALAB_API_KEY, TAVILY_API_KEY: process.env.TAVILY_API_KEY, diff --git a/src/lib/ocr/adapters/azureAdapter.ts b/src/lib/ocr/adapters/azureAdapter.ts index 6aee2ae3..54368aec 100644 --- a/src/lib/ocr/adapters/azureAdapter.ts +++ b/src/lib/ocr/adapters/azureAdapter.ts @@ -11,6 +11,7 @@ import type { ExtractedTable, OCRProvider, } from "../types"; +import { fetchBlob } from "~/server/storage/vercel-blob"; /** * Azure Document Intelligence API response types @@ -219,7 +220,7 @@ export class AzureDocumentIntelligenceAdapter implements OCRAdapter { const fullUrl = queryParams.toString() ? `${url}&${queryParams}` : url; // Fetch document server-side and send as binary. Azure cannot reach localhost/private URLs. - const docResponse = await fetch(documentUrl); + const docResponse = await fetchBlob(documentUrl); if (!docResponse.ok) { throw new Error( `Failed to fetch document from ${documentUrl}: ${docResponse.status} ${docResponse.statusText}` diff --git a/src/lib/ocr/adapters/landingAdapter.ts b/src/lib/ocr/adapters/landingAdapter.ts index ace1296b..c32e37e1 100644 --- a/src/lib/ocr/adapters/landingAdapter.ts +++ b/src/lib/ocr/adapters/landingAdapter.ts @@ -13,6 +13,7 @@ import type { PageContent, OCRProvider, } from "../types"; +import { fetchBlob } from "~/server/storage/vercel-blob"; /** ADE Parse API base URL (use api.va.eu-west-1.landing.ai for EU) */ const ADE_BASE_URL = "https://api.va.landing.ai"; @@ -77,7 +78,7 @@ export class LandingAIAdapter implements OCRAdapter { // Fetch document server-side and send as binary. Landing AI's servers cannot reach // localhost or private URLs (e.g. /api/files/7), so we must upload the file ourselves. 
-      const docResponse = await fetch(documentUrl);
+      const docResponse = await fetchBlob(documentUrl);
       if (!docResponse.ok) {
         throw new Error(
           `Failed to fetch document from ${documentUrl}: ${docResponse.status} ${docResponse.statusText}`
         )
diff --git a/src/lib/ocr/complexity.ts b/src/lib/ocr/complexity.ts
index ea8e6768..5fd258df 100644
--- a/src/lib/ocr/complexity.ts
+++ b/src/lib/ocr/complexity.ts
@@ -1,6 +1,7 @@
 import type { OCRProvider } from "~/lib/ocr/types";
 import type { ClassificationResult } from "@huggingface/transformers";
 import { PDFDocument } from "pdf-lib";
+import { fetchBlob } from "~/server/storage/vercel-blob";
 
 const SAMPLING_CONFIG = {
   MIN_PAGES_TO_SAMPLE: 3,
@@ -131,7 +132,7 @@ function getDefaultOCRProvider(): OCRProvider {
 }
 
 export async function determineDocumentRouting(documentUrl: string): Promise {
-  const response = await fetch(documentUrl);
+  const response = await fetchBlob(documentUrl);
   const buffer = await response.arrayBuffer();
 
   let pageCount = 0;
diff --git a/src/lib/ocr/processor.ts b/src/lib/ocr/processor.ts
index d6894349..c18652cb 100644
--- a/src/lib/ocr/processor.ts
+++ b/src/lib/ocr/processor.ts
@@ -21,6 +21,7 @@
 } from "~/server/db/schema";
 import { eq, sql } from "drizzle-orm";
 import crypto from "crypto";
+import { fetchBlob } from "~/server/storage/vercel-blob";
 
 import type {
   ProcessDocumentEventData,
@@ -183,7 +184,7 @@ export async function routeDocument(
 async function getPageCount(documentUrl: string): Promise<number> {
   try {
     const { PDFDocument } = await import("pdf-lib");
-    const response = await fetch(documentUrl);
+    const response = await fetchBlob(documentUrl);
     const buffer = await response.arrayBuffer();
     const doc = await PDFDocument.load(buffer, { ignoreEncryption: true });
     return doc.getPageCount();
@@ -238,7 +239,7 @@ export async function normalizeDocument(
   // 3. Explicitly requested via options (future)
   const isPdf =
     documentUrl.toLowerCase().endsWith(".pdf") ||
-    (await fetch(documentUrl, { method: "HEAD" }).then(r => r.headers.get("content-type") === "application/pdf").catch(() => false));
+    (await fetchBlob(documentUrl, { method: "HEAD" }).then(r => r.headers.get("content-type") === "application/pdf").catch(() => false));
 
   if (isPdf && process.env.OPENAI_API_KEY) {
     const isComplex = routerDecision.visionLabel
@@ -250,7 +251,7 @@
     if (isComplex || isLowConfidence) {
       console.log(`[Enrichment] Triggering VLM enrichment (Complex=${isComplex}, LowConf=${isLowConfidence})`);
       try {
-        const response = await fetch(documentUrl);
+        const response = await fetchBlob(documentUrl);
         const buffer = await response.arrayBuffer();
 
         // Identify pages to enrich (all for now, or sample? Plan says "Complex" or "Low Conf").
@@ -801,7 +802,7 @@ export async function processNativePDF(documentUrl: string): Promise {
-  const response = await fetch(documentUrl);
+  const response = await fetchBlob(documentUrl);
diff --git a/src/server/storage/vercel-blob.ts b/src/server/storage/vercel-blob.ts
new file mode 100644
--- /dev/null
+++ b/src/server/storage/vercel-blob.ts
+import { put, type PutBlobResult } from "@vercel/blob";
+import { randomUUID } from "crypto";
+
+// Remember the store's access mode after the first successful upload so the
+// public-vs-private probe only happens once per process.
+let detectedAccess: "public" | "private" | null = null;
+
+function getBlobToken(): string {
+  const token = process.env.BLOB_READ_WRITE_TOKEN;
+  if (!token) {
+    throw new Error("BLOB_READ_WRITE_TOKEN is not set");
+  }
+  return token;
+}
+
+export async function putFile({
+  filename,
+  data,
+  contentType,
+}: {
+  filename: string;
+  data: ArrayBuffer | Buffer;
+  contentType?: string;
+}): Promise<{ url: string; pathname: string; contentType: string; checksum: string | null }> {
+  const token = getBlobToken();
+  const safeName = sanitizeFilename(filename);
+  const key = `documents/${randomUUID()}-${safeName.length > 0 ? safeName : "upload"}`;
+
+  const body = Buffer.from(data instanceof ArrayBuffer ?
new Uint8Array(data) : data); + + const tryPut = (access: "public" | "private") => + put(key, body, { access, contentType, token }); + + let blob: PutBlobResult; + if (detectedAccess) { + blob = await tryPut(detectedAccess); + } else { + try { + blob = await tryPut("public"); + detectedAccess = "public"; + } catch (err) { + if (err instanceof Error && err.message.includes("private store")) { + blob = await tryPut("private"); + detectedAccess = "private"; + } else { + throw err; + } + } + } + + const extended = blob as PutBlobResult & { contentHash?: string | null }; + + return { + url: blob.url, + pathname: blob.pathname, + contentType: blob.contentType, + checksum: extended.contentHash ?? null, + }; +} + +export function isPrivateBlobUrl(url: string): boolean { + return url.includes(".private.blob."); +} + +export async function fetchBlob(url: string, init?: RequestInit): Promise { + if (isPrivateBlobUrl(url)) { + const token = getBlobToken(); + return fetch(url, { + ...init, + headers: { + ...(init?.headers as Record | undefined), + Authorization: `Bearer ${token}`, + }, + }); + } + return fetch(url, init); +} + +function sanitizeFilename(filename: string): string { + return filename.replace(/\s+/g, "-").replace(/[^a-zA-Z0-9.\-_]/g, ""); +}
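
For reviewers who want to see the intended call pattern: below is a minimal sketch (not part of the patch) of how the new storage helpers compose, using only the `putFile`, `fetchBlob`, and `isPrivateBlobUrl` exports that `src/server/storage/vercel-blob.ts` introduces above. The `roundTrip` helper name is illustrative.

```ts
import { putFile, fetchBlob, isPrivateBlobUrl } from "~/server/storage/vercel-blob";

// Upload a file, then read it back the same way /api/files/[id] does.
export async function roundTrip(file: File): Promise<ArrayBuffer> {
  // putFile probes the store on first use and caches public vs. private.
  const blob = await putFile({
    filename: file.name,
    data: await file.arrayBuffer(),
    contentType: file.type || undefined,
  });

  // fetchBlob attaches the read/write token only for private-store URLs,
  // so the same call works against either store mode.
  const res = await fetchBlob(blob.url);
  if (!res.ok) throw new Error(`Blob fetch failed: ${res.status}`);
  console.log(isPrivateBlobUrl(blob.url) ? "private store" : "public store");
  return res.arrayBuffer();
}
```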
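To check `BLOB_READ_WRITE_TOKEN` before exercising the app's upload flow, a standalone smoke test against `@vercel/blob` also works. The script path and run command are suggestions, and the sketch assumes a public store; on a private store, the `put` call failing with an access error is itself a useful signal.

```ts
// scripts/verify-blob.ts (hypothetical) — e.g. run with: pnpm tsx scripts/verify-blob.ts
import { put } from "@vercel/blob";

async function main() {
  const token = process.env.BLOB_READ_WRITE_TOKEN;
  if (!token) throw new Error("Set BLOB_READ_WRITE_TOKEN first");

  // Upload a tiny text blob; expect a *.vercel-storage.com URL back.
  const blob = await put(`healthcheck/${Date.now()}.txt`, "blob smoke test", {
    access: "public",
    token,
  });
  console.log("uploaded:", blob.url);

  // Public stores should serve the bytes straight back.
  const res = await fetch(blob.url);
  console.log("fetch status:", res.status, "body:", await res.text());
}

main().catch((err) => {
  console.error(err);
  process.exit(1);
});
```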