From 3542fc384e0b1e078ef8c8dc263471df9cb6e35b Mon Sep 17 00:00:00 2001 From: Nico Albanese Date: Thu, 3 Apr 2025 16:46:40 +0100 Subject: [PATCH 1/4] docs: add local caching middleware recipe --- .../05-node/80-local-caching-middleware.mdx | 255 ++++++++++++++++++ 1 file changed, 255 insertions(+) create mode 100644 content/cookbook/05-node/80-local-caching-middleware.mdx diff --git a/content/cookbook/05-node/80-local-caching-middleware.mdx b/content/cookbook/05-node/80-local-caching-middleware.mdx new file mode 100644 index 000000000000..522a1781abfd --- /dev/null +++ b/content/cookbook/05-node/80-local-caching-middleware.mdx @@ -0,0 +1,255 @@ +--- +title: Local Caching Middleware +description: Learn how to create a caching middleware for local development. +tags: ['streaming', 'caching', 'middleware'] +--- + +# Local Caching Middleware + +When developing AI applications, you'll often find yourself repeatedly making the same API calls during development. This can lead to increased costs and slower development cycles. A caching middleware allows you to store responses locally and reuse them when the same inputs are provided. + +This approach is particularly useful in two scenarios: + +1. **Iterating on UI/UX** - When you're focused on styling and user experience, you don't want to regenerate AI responses for every code change. +2. **Working on evals** - When developing evals, you need to repeatedly test the same prompts, but don't need new generations each time. + +## Implementation + +In this implementation, you create a JSON file to store responses. When a request is made, you first check if you have already seen this exact request. If you have, you return the cached response immediately (as a one-off generation or chunks of tokens). If not, you trigger the generation, save the response, and return it. 
+
+### Key Components:
+
+- **Cache Storage**: Uses a JSON file (`.cache/ai-cache.json` can be anyhwere you want) to persist responses between app restarts
+- **Middleware Functions**: Intercepts both single-response (`wrapGenerate`) and streaming (`wrapStream`) API calls
+- **Request Fingerprinting**: Creates unique keys based on the prompt and parameters
+
+
+  Make sure to add the path of your local cache to your `.gitignore` so you do
+  not commit your cache.
+
+
+### How Caching Works:
+
+For regular generations, you store and retrieve complete responses. Instead, the streaming implementation captures each token as it arrives, stores the full sequence, and on cache hits uses the SDK's `simulateReadableStream` utility to recreate the token-by-token streaming experience at a controlled speed (defaults to 10ms between chunks).
+
+This approach gives you the best of both worlds:
+
+- Instant responses for repeated queries
+- Preserved streaming behavior for UI development
+
+The middleware handles all transformations needed to make cached responses indistinguishable from fresh ones, including normalizing tool calls and fixing timestamp formats.
+
+```ts
+import {
+  type LanguageModelV1,
+  type LanguageModelV1Middleware,
+  type LanguageModelV1Prompt,
+  type LanguageModelV1StreamPart,
+  simulateReadableStream,
+  wrapLanguageModel,
+} from 'ai';
+import 'dotenv/config';
+import fs from 'fs';
+import path from 'path';
+
+/** Location of the JSON cache file, resolved against the current working directory. */
+const CACHE_FILE = path.join(process.cwd(), '.cache/ai-cache.json');
+
+/** Call options the middleware receives as `params` (derived from `doGenerate`). */
+type CallOptions = Parameters<LanguageModelV1['doGenerate']>[0];
+
+/** Resolved result of a non-streaming generation; this is what `wrapGenerate` caches. */
+type GenerateResult = Awaited<ReturnType<LanguageModelV1['doGenerate']>>;
+
+/**
+ * Wraps a language model so that its generate and stream calls go through
+ * the file-backed cache middleware defined below.
+ *
+ * @param model - The model to wrap.
+ * @returns The wrapped model.
+ */
+export const cached = (model: LanguageModelV1) =>
+  wrapLanguageModel({
+    middleware: cacheMiddleware,
+    model,
+  });
+
+/**
+ * Creates the cache directory and an empty (`{}`) cache file if either is
+ * missing, so later reads never fail on a missing path.
+ */
+const ensureCacheFile = () => {
+  const cacheDir = path.dirname(CACHE_FILE);
+  if (!fs.existsSync(cacheDir)) {
+    fs.mkdirSync(cacheDir, { recursive: true });
+  }
+  if (!fs.existsSync(CACHE_FILE)) {
+    fs.writeFileSync(CACHE_FILE, '{}');
+  }
+};
+
+/**
+ * Looks up a cache entry.
+ *
+ * @param key - Cache key; objects are serialized with `JSON.stringify`.
+ * @returns The stored value, or `null` when the key is absent or the cache
+ *   file cannot be read or parsed (the error is logged, not thrown).
+ */
+const getCachedResult = (key: string | object) => {
+  ensureCacheFile();
+  const cacheKey = typeof key === 'object' ? JSON.stringify(key) : key;
+  try {
+    const cacheContent = fs.readFileSync(CACHE_FILE, 'utf-8');
+
+    const cache = JSON.parse(cacheContent);
+
+    const result = cache[cacheKey];
+
+    return result ?? null;
+  } catch (error) {
+    console.error('Cache error:', error);
+    return null;
+  }
+};
+
+/**
+ * Stores `value` under `key` by rewriting the whole cache file.
+ * Failures are logged and otherwise ignored, so a broken cache never fails
+ * the model call itself.
+ *
+ * @param key - Cache key.
+ * @param value - JSON-serializable value to persist.
+ */
+const updateCache = (key: string, value: unknown) => {
+  ensureCacheFile();
+  try {
+    const cache = JSON.parse(fs.readFileSync(CACHE_FILE, 'utf-8'));
+    const updatedCache = { ...cache, [key]: value };
+    fs.writeFileSync(CACHE_FILE, JSON.stringify(updatedCache, null, 2));
+    console.log('Cache updated for key:', key);
+  } catch (error) {
+    console.error('Failed to update cache:', error);
+  }
+};
+
+/**
+ * Normalizes a prompt for use in a cache key: every tool call id is replaced
+ * with the constant `'cached'` and tool results are blanked. Each message
+ * keeps its `role` and remaining fields, so the result is still a valid
+ * prompt and messages of different roles cannot collapse into the same key
+ * fragment.
+ *
+ * @param prompt - The prompt passed to the model call.
+ * @returns A normalized copy; the input is not mutated.
+ */
+const cleanPrompt = (prompt: LanguageModelV1Prompt): LanguageModelV1Prompt =>
+  prompt.map(message => {
+    if (message.role === 'assistant') {
+      return {
+        ...message,
+        content: message.content.map(part =>
+          part.type === 'tool-call' ? { ...part, toolCallId: 'cached' } : part,
+        ),
+      };
+    }
+    if (message.role === 'tool') {
+      return {
+        ...message,
+        content: message.content.map(part => ({
+          ...part,
+          toolCallId: 'cached',
+          result: {},
+        })),
+      };
+    }
+
+    return message;
+  });
+
+/**
+ * Builds the cache key from the normalized prompt plus the remaining call
+ * settings, so the same prompt sent with different settings is cached
+ * separately.
+ *
+ * @param fn - Which wrapped call the key is for; keeps generate and stream
+ *   entries apart.
+ * @param params - The call options received by the middleware.
+ * @returns The serialized key.
+ */
+const createCacheKey = (fn: 'generate' | 'stream', params: CallOptions) =>
+  JSON.stringify({
+    ...params,
+    // Per-request plumbing, not part of the request's meaning.
+    // `JSON.stringify` omits properties whose value is `undefined`.
+    abortSignal: undefined,
+    headers: undefined,
+    prompt: cleanPrompt(params.prompt),
+    _function: fn,
+  });
+
+/**
+ * Middleware that serves generate and stream calls from the local JSON
+ * cache when the same request has been seen before, and records the
+ * response otherwise.
+ */
+export const cacheMiddleware: LanguageModelV1Middleware = {
+  wrapGenerate: async ({ doGenerate, params }) => {
+    const cacheKey = createCacheKey('generate', params);
+    console.log('Cache Key:', cacheKey);
+
+    const cached = getCachedResult(cacheKey) as GenerateResult | null;
+
+    if (cached) {
+      console.log('Cache Hit');
+      return {
+        ...cached,
+        response: {
+          ...cached.response,
+          // The timestamp was serialized to a string in the cache file;
+          // turn it back into a Date.
+          timestamp: cached.response?.timestamp
+            ? new Date(cached.response.timestamp)
+            : undefined,
+        },
+      };
+    }
+
+    console.log('Cache Miss');
+    const result = await doGenerate();
+
+    updateCache(cacheKey, result);
+
+    return result;
+  },
+  wrapStream: async ({ doStream, params }) => {
+    const cacheKey = createCacheKey('stream', params);
+    console.log('Cache Key:', cacheKey);
+
+    // Check if the result is in the cache
+    const cached = getCachedResult(cacheKey);
+
+    // If cached, return a simulated ReadableStream that yields the cached result
+    if (cached) {
+      console.log('Cache Hit');
+      // Format the timestamps in the cached response
+      const formattedChunks = (cached as LanguageModelV1StreamPart[]).map(p => {
+        if (p.type === 'response-metadata' && p.timestamp) {
+          return { ...p, timestamp: new Date(p.timestamp) };
+        } else return p;
+      });
+      return {
+        stream: simulateReadableStream({
+          initialDelayInMs: 0,
+          chunkDelayInMs: 10,
+          chunks: formattedChunks,
+        }),
+        rawCall: { rawPrompt: null, rawSettings: {} },
+      };
+    }
+
+    console.log('Cache Miss');
+    // If not cached, proceed with streaming
+    const { stream, ...rest } = await doStream();
+
+    const fullResponse: LanguageModelV1StreamPart[] = [];
+
+    // Pass every chunk through untouched while recording it for the cache.
+    const transformStream = new TransformStream<
+      LanguageModelV1StreamPart,
+      LanguageModelV1StreamPart
+    >({
+      transform(chunk, controller) {
+        fullResponse.push(chunk);
+        controller.enqueue(chunk);
+      },
+      flush() {
+        // Store the full response in the cache after streaming is complete
+        updateCache(cacheKey, fullResponse);
+      },
+    });
+
+    return {
+      stream: stream.pipeThrough(transformStream),
+      ...rest,
+    };
+  },
+};
+```
+
+## Using the Middleware
+
+The middleware can be easily integrated into your existing AI SDK setup:
+
+```ts highlight="4,8"
+import { openai } from '@ai-sdk/openai';
+import { streamText } from 'ai';
+import 'dotenv/config';
+import { cached } from '../middleware/your-cache-middleware';
+
+async function main() {
+  const result = 
streamText({ + model: cached(openai('gpt-4o')), + maxTokens: 512, + temperature: 0.3, + maxRetries: 5, + prompt: 'Invent a new holiday and describe its traditions.', + }); + + for await (const textPart of result.textStream) { + process.stdout.write(textPart); + } + + console.log(); + console.log('Token usage:', await result.usage); + console.log('Finish reason:', await result.finishReason); +} + +main().catch(console.error); +``` + +## Considerations + +When using this caching middleware, keep these points in mind: + +1. **Development Only** - This approach is intended for local development, not production environments +2. **Cache Invalidation** - You'll need to clear the cache (delete the cache file) when you want fresh responses +3. **maxSteps behaviour** - When using `maxSteps`, be aware that the caching only applies to the language model's responses, not the entire tool execution pipeline. + + + When the model decides to use tools, it generates a tool call which the AI SDK + parses and executes. The tool result is then appended to the message history. + This middleware only caches the language model's generations, not the actual + results from tool executions, which may lead to inconsistencies if tool + outputs change between runs. Also be aware the that tool call will be + executed. + From d76732852a73a05f5853627f08b857d3d141b7b1 Mon Sep 17 00:00:00 2001 From: Nico Albanese Date: Thu, 3 Apr 2025 16:48:30 +0100 Subject: [PATCH 2/4] update --- content/cookbook/05-node/80-local-caching-middleware.mdx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/content/cookbook/05-node/80-local-caching-middleware.mdx b/content/cookbook/05-node/80-local-caching-middleware.mdx index 522a1781abfd..3a4204ca6b59 100644 --- a/content/cookbook/05-node/80-local-caching-middleware.mdx +++ b/content/cookbook/05-node/80-local-caching-middleware.mdx @@ -25,7 +25,7 @@ In this implementation, you create a JSON file to store responses. 
When a reques Make sure to add the path of your local cache to your `.gitignore` so you do - not commit your cache. + not commit it. ### How Caching Works: From 760cc4da0cd6f28e4a92960f798947c092d02959 Mon Sep 17 00:00:00 2001 From: Nico Albanese Date: Thu, 3 Apr 2025 16:49:56 +0100 Subject: [PATCH 3/4] update --- .../cookbook/05-node/80-local-caching-middleware.mdx | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/content/cookbook/05-node/80-local-caching-middleware.mdx b/content/cookbook/05-node/80-local-caching-middleware.mdx index 3a4204ca6b59..110d65c462c3 100644 --- a/content/cookbook/05-node/80-local-caching-middleware.mdx +++ b/content/cookbook/05-node/80-local-caching-middleware.mdx @@ -17,18 +17,12 @@ This approach is particularly useful in two scenarios: In this implementation, you create a JSON file to store responses. When a request is made, you first check if you have already seen this exact request. If you have, you return the cached response immediately (as a one-off generation or chunks of tokens). If not, you trigger the generation, save the response, and return it. -### Key Components: - -- **Cache Storage**: Uses a JSON file (`.cache/ai-cache.json` can be anyhwere you want) to persist responses between app restarts -- **Middleware Functions**: Intercepts both single-response (`wrapGenerate`) and streaming (`wrapStream`) API calls -- **Request Fingerprinting**: Creates unique keys based on the prompt and parameters - Make sure to add the path of your local cache to your `.gitignore` so you do not commit it. -### How Caching Works: +### How it works For regular generations, you store and retrieve complete responses. Instead, the streaming implementation captures each token as it arrives, stores the full sequence, and on cache hits uses the SDK's `simulateReadableStream` utility to recreate the token-by-token streaming experience at a controlled speed (defaults to 10ms between chunks). 
@@ -39,6 +33,8 @@ This approach gives you the best of both worlds: The middleware handles all transformations needed to make cached responses indistinguishable from fresh ones, including normalizing tool calls and fixing timestamp formats. +### Middleware + ```ts import { type LanguageModelV1, From f68a6f22c273de238eb37c3dd84b08533730d71e Mon Sep 17 00:00:00 2001 From: Nico Albanese Date: Fri, 4 Apr 2025 10:45:13 +0200 Subject: [PATCH 4/4] update --- .../cookbook/05-node/80-local-caching-middleware.mdx | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/content/cookbook/05-node/80-local-caching-middleware.mdx b/content/cookbook/05-node/80-local-caching-middleware.mdx index 110d65c462c3..e2a95a51287d 100644 --- a/content/cookbook/05-node/80-local-caching-middleware.mdx +++ b/content/cookbook/05-node/80-local-caching-middleware.mdx @@ -239,13 +239,4 @@ When using this caching middleware, keep these points in mind: 1. **Development Only** - This approach is intended for local development, not production environments 2. **Cache Invalidation** - You'll need to clear the cache (delete the cache file) when you want fresh responses -3. **maxSteps behaviour** - When using `maxSteps`, be aware that the caching only applies to the language model's responses, not the entire tool execution pipeline. - - - When the model decides to use tools, it generates a tool call which the AI SDK - parses and executes. The tool result is then appended to the message history. - This middleware only caches the language model's generations, not the actual - results from tool executions, which may lead to inconsistencies if tool - outputs change between runs. Also be aware the that tool call will be - executed. - +3. **Multi-Step Flows** - When using `maxSteps`, be aware that caching occurs at the individual language model response level, not across the entire execution flow. 
This means that while the model's generation is cached, the tool call itself is not: the tool still executes on every run, even on a cache hit.