From 59d70a9c70b49e5dc92e1e9cce9c1c81ccbd46f0 Mon Sep 17 00:00:00 2001 From: James Broadhead Date: Fri, 3 Apr 2026 12:00:04 +0000 Subject: [PATCH 01/17] Add INLINE + ARROW_STREAM format support for analytics plugin Some serverless warehouses only support ARROW_STREAM with INLINE disposition, but the analytics plugin only offered JSON_ARRAY (INLINE) and ARROW_STREAM (EXTERNAL_LINKS). This adds a new "ARROW_STREAM" format option that uses INLINE disposition, making the plugin compatible with these warehouses. Fixes https://github.com/databricks/appkit/issues/242 --- packages/appkit/src/plugins/analytics/analytics.ts | 14 +++++++++++--- packages/appkit/src/plugins/analytics/types.ts | 2 +- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/packages/appkit/src/plugins/analytics/analytics.ts b/packages/appkit/src/plugins/analytics/analytics.ts index a9c688dac..481ef5e18 100644 --- a/packages/appkit/src/plugins/analytics/analytics.ts +++ b/packages/appkit/src/plugins/analytics/analytics.ts @@ -159,9 +159,17 @@ export class AnalyticsPlugin extends Plugin { }, type: "arrow", } - : { - type: "result", - }; + : format === "ARROW_STREAM" + ? { + formatParameters: { + disposition: "INLINE", + format: "ARROW_STREAM", + }, + type: "result", + } + : { + type: "result", + }; const hashedQuery = this.queryProcessor.hashQuery(query); diff --git a/packages/appkit/src/plugins/analytics/types.ts b/packages/appkit/src/plugins/analytics/types.ts index c58b6ecfe..bc7568f9c 100644 --- a/packages/appkit/src/plugins/analytics/types.ts +++ b/packages/appkit/src/plugins/analytics/types.ts @@ -4,7 +4,7 @@ export interface IAnalyticsConfig extends BasePluginConfig { timeout?: number; } -export type AnalyticsFormat = "JSON" | "ARROW"; +export type AnalyticsFormat = "JSON" | "ARROW" | "ARROW_STREAM"; export interface IAnalyticsQueryRequest { parameters?: Record; format?: AnalyticsFormat; From b1566eaa93162b5d1d5b5749cd9baab286c57fbe Mon Sep 17 00:00:00 2001 From: James Broadhead Date: Fri, 3 Apr 2026 17:26:09 +0000 Subject: [PATCH 02/17] Add tests for ARROW_STREAM and ARROW format parameter handling Tests verify: - ARROW_STREAM format passes INLINE disposition + ARROW_STREAM format - ARROW format passes EXTERNAL_LINKS disposition + ARROW_STREAM format - Default JSON format does not pass disposition or format overrides --- .../plugins/analytics/tests/analytics.test.ts | 104 ++++++++++++++++++ 1 file changed, 104 insertions(+) diff --git a/packages/appkit/src/plugins/analytics/tests/analytics.test.ts b/packages/appkit/src/plugins/analytics/tests/analytics.test.ts index 9a30440ed..2051d2f6e 100644 --- a/packages/appkit/src/plugins/analytics/tests/analytics.test.ts +++ b/packages/appkit/src/plugins/analytics/tests/analytics.test.ts @@ -584,6 +584,110 @@ describe("Analytics Plugin", () => { ); }); + test("/query/:query_key should pass INLINE + ARROW_STREAM format parameters when format is ARROW_STREAM", async () => { + const plugin = new AnalyticsPlugin(config); + const { router, getHandler } = createMockRouter(); + + (plugin as any).app.getAppQuery = vi.fn().mockResolvedValue({ + query: "SELECT * FROM test", + isAsUser: false, + }); + + const executeMock = vi.fn().mockResolvedValue({ + result: { data: [{ id: 1 }] }, + }); + (plugin as any).SQLClient.executeStatement = executeMock; + + plugin.injectRoutes(router); + + const handler = getHandler("POST", "/query/:query_key"); + const mockReq = createMockRequest({ + params: { query_key: "test_query" }, + body: { parameters: {}, format: "ARROW_STREAM" }, + }); + const 
mockRes = createMockResponse(); + + await handler(mockReq, mockRes); + + expect(executeMock).toHaveBeenCalledWith( + expect.anything(), + expect.objectContaining({ + statement: "SELECT * FROM test", + warehouse_id: "test-warehouse-id", + disposition: "INLINE", + format: "ARROW_STREAM", + }), + expect.any(AbortSignal), + ); + }); + + test("/query/:query_key should pass EXTERNAL_LINKS + ARROW_STREAM format parameters when format is ARROW", async () => { + const plugin = new AnalyticsPlugin(config); + const { router, getHandler } = createMockRouter(); + + (plugin as any).app.getAppQuery = vi.fn().mockResolvedValue({ + query: "SELECT * FROM test", + isAsUser: false, + }); + + const executeMock = vi.fn().mockResolvedValue({ + result: { data: [{ id: 1 }] }, + }); + (plugin as any).SQLClient.executeStatement = executeMock; + + plugin.injectRoutes(router); + + const handler = getHandler("POST", "/query/:query_key"); + const mockReq = createMockRequest({ + params: { query_key: "test_query" }, + body: { parameters: {}, format: "ARROW" }, + }); + const mockRes = createMockResponse(); + + await handler(mockReq, mockRes); + + expect(executeMock).toHaveBeenCalledWith( + expect.anything(), + expect.objectContaining({ + statement: "SELECT * FROM test", + warehouse_id: "test-warehouse-id", + disposition: "EXTERNAL_LINKS", + format: "ARROW_STREAM", + }), + expect.any(AbortSignal), + ); + }); + + test("/query/:query_key should not pass format parameters when format is JSON (default)", async () => { + const plugin = new AnalyticsPlugin(config); + const { router, getHandler } = createMockRouter(); + + (plugin as any).app.getAppQuery = vi.fn().mockResolvedValue({ + query: "SELECT * FROM test", + isAsUser: false, + }); + + const executeMock = vi.fn().mockResolvedValue({ + result: { data: [{ id: 1 }] }, + }); + (plugin as any).SQLClient.executeStatement = executeMock; + + plugin.injectRoutes(router); + + const handler = getHandler("POST", "/query/:query_key"); + const mockReq = createMockRequest({ + params: { query_key: "test_query" }, + body: { parameters: {} }, + }); + const mockRes = createMockResponse(); + + await handler(mockReq, mockRes); + + const callArgs = executeMock.mock.calls[0][1]; + expect(callArgs).not.toHaveProperty("disposition"); + expect(callArgs).not.toHaveProperty("format"); + }); + test("should return 404 when query file is not found", async () => { const plugin = new AnalyticsPlugin(config); const { router, getHandler } = createMockRouter(); From dbe8ea3b76a05453546bc1187d7e75db5bb3549d Mon Sep 17 00:00:00 2001 From: James Broadhead Date: Fri, 3 Apr 2026 18:10:10 +0000 Subject: [PATCH 03/17] fix: propagate ARROW_STREAM format to UI layer and typegen The server-side ARROW_STREAM format added in the previous commit was not exposed to the frontend or typegen: - Add "ARROW_STREAM" to AnalyticsFormat in appkit-ui hooks - Add "arrow_stream" to DataFormat in chart types - Handle "arrow_stream" in useChartData's resolveFormat() - Make typegen resilient to ARROW_STREAM-only warehouses by retrying DESCRIBE QUERY without format when JSON_ARRAY is rejected Co-authored-by: Isaac Signed-off-by: James Broadhead --- packages/appkit-ui/src/react/charts/types.ts | 2 +- packages/appkit-ui/src/react/hooks/types.ts | 2 +- .../src/react/hooks/use-chart-data.ts | 3 +- .../src/type-generator/query-registry.ts | 30 ++++++++++++++++--- 4 files changed, 30 insertions(+), 7 deletions(-) diff --git a/packages/appkit-ui/src/react/charts/types.ts b/packages/appkit-ui/src/react/charts/types.ts index 65804a741..fdcc55f1a 
100644 --- a/packages/appkit-ui/src/react/charts/types.ts +++ b/packages/appkit-ui/src/react/charts/types.ts @@ -5,7 +5,7 @@ import type { Table } from "apache-arrow"; // ============================================================================ /** Supported data formats for analytics queries */ -export type DataFormat = "json" | "arrow" | "auto"; +export type DataFormat = "json" | "arrow" | "arrow_stream" | "auto"; /** Chart orientation */ export type Orientation = "vertical" | "horizontal"; diff --git a/packages/appkit-ui/src/react/hooks/types.ts b/packages/appkit-ui/src/react/hooks/types.ts index 03e943e2a..3d539c0f3 100644 --- a/packages/appkit-ui/src/react/hooks/types.ts +++ b/packages/appkit-ui/src/react/hooks/types.ts @@ -5,7 +5,7 @@ import type { Table } from "apache-arrow"; // ============================================================================ /** Supported response formats for analytics queries */ -export type AnalyticsFormat = "JSON" | "ARROW"; +export type AnalyticsFormat = "JSON" | "ARROW" | "ARROW_STREAM"; /** * Typed Arrow Table - preserves row type information for type inference. diff --git a/packages/appkit-ui/src/react/hooks/use-chart-data.ts b/packages/appkit-ui/src/react/hooks/use-chart-data.ts index d8d0bd386..8b209faa6 100644 --- a/packages/appkit-ui/src/react/hooks/use-chart-data.ts +++ b/packages/appkit-ui/src/react/hooks/use-chart-data.ts @@ -50,10 +50,11 @@ export interface UseChartDataResult { function resolveFormat( format: DataFormat, parameters?: Record, -): "JSON" | "ARROW" { +): "JSON" | "ARROW" | "ARROW_STREAM" { // Explicit format selection if (format === "json") return "JSON"; if (format === "arrow") return "ARROW"; + if (format === "arrow_stream") return "ARROW_STREAM"; // Auto-selection heuristics if (format === "auto") { diff --git a/packages/appkit/src/type-generator/query-registry.ts b/packages/appkit/src/type-generator/query-registry.ts index 196690c2d..4dbdb2596 100644 --- a/packages/appkit/src/type-generator/query-registry.ts +++ b/packages/appkit/src/type-generator/query-registry.ts @@ -386,10 +386,32 @@ export async function generateQueriesFromDescribe( sqlHash, cleanedSql, }: (typeof uncachedQueries)[number]): Promise => { - const result = (await client.statementExecution.executeStatement({ - statement: `DESCRIBE QUERY ${cleanedSql}`, - warehouse_id: warehouseId, - })) as DatabricksStatementExecutionResponse; + let result: DatabricksStatementExecutionResponse; + try { + // Prefer JSON_ARRAY for predictable data_array parsing. + result = (await client.statementExecution.executeStatement({ + statement: `DESCRIBE QUERY ${cleanedSql}`, + warehouse_id: warehouseId, + format: "JSON_ARRAY", + disposition: "INLINE", + })) as DatabricksStatementExecutionResponse; + } catch (err: unknown) { + const msg = err instanceof Error ? err.message : String(err); + if (msg.includes("ARROW_STREAM") || msg.includes("JSON_ARRAY")) { + // Warehouse doesn't support JSON_ARRAY inline — retry with no format + // to let it use its default (typically ARROW_STREAM inline). 
+ logger.debug( + "Warehouse rejected JSON_ARRAY for %s, retrying with default format", + queryName, + ); + result = (await client.statementExecution.executeStatement({ + statement: `DESCRIBE QUERY ${cleanedSql}`, + warehouse_id: warehouseId, + })) as DatabricksStatementExecutionResponse; + } else { + throw err; + } + } completed++; spinner.update( From 8fe05d8c4ded9efc244af87723a0dfc4c590a017 Mon Sep 17 00:00:00 2001 From: James Broadhead Date: Fri, 3 Apr 2026 18:14:53 +0000 Subject: [PATCH 04/17] fix: default analytics format to ARROW_STREAM for broadest warehouse compatibility ARROW_STREAM with INLINE disposition is the only format that works across all warehouse types, including serverless warehouses that reject JSON_ARRAY. Change the default from JSON to ARROW_STREAM throughout: - Server: defaults.ts, analytics plugin request handler - Client: useAnalyticsQuery, UseAnalyticsQueryOptions, useChartData - Tests: update assertions for new default JSON and ARROW formats remain available via explicit format parameter. Co-authored-by: Isaac Signed-off-by: James Broadhead --- .../hooks/__tests__/use-chart-data.test.ts | 8 ++-- packages/appkit-ui/src/react/hooks/types.ts | 6 ++- .../src/react/hooks/use-analytics-query.ts | 4 +- .../src/react/hooks/use-chart-data.ts | 4 +- .../src/connectors/sql-warehouse/defaults.ts | 2 +- .../appkit/src/plugins/analytics/analytics.ts | 3 +- .../plugins/analytics/tests/analytics.test.ts | 37 ++++++++++++++++++- 7 files changed, 51 insertions(+), 13 deletions(-) diff --git a/packages/appkit-ui/src/react/hooks/__tests__/use-chart-data.test.ts b/packages/appkit-ui/src/react/hooks/__tests__/use-chart-data.test.ts index 3d5e96f11..32ce52cb2 100644 --- a/packages/appkit-ui/src/react/hooks/__tests__/use-chart-data.test.ts +++ b/packages/appkit-ui/src/react/hooks/__tests__/use-chart-data.test.ts @@ -205,7 +205,7 @@ describe("useChartData", () => { ); }); - test("auto-selects JSON by default when no heuristics match", () => { + test("auto-selects ARROW_STREAM by default when no heuristics match", () => { mockUseAnalyticsQuery.mockReturnValue({ data: [], loading: false, @@ -223,11 +223,11 @@ describe("useChartData", () => { expect(mockUseAnalyticsQuery).toHaveBeenCalledWith( "test", { limit: 100 }, - expect.objectContaining({ format: "JSON" }), + expect.objectContaining({ format: "ARROW_STREAM" }), ); }); - test("defaults to auto format (JSON) when format is not specified", () => { + test("defaults to auto format (ARROW_STREAM) when format is not specified", () => { mockUseAnalyticsQuery.mockReturnValue({ data: [], loading: false, @@ -243,7 +243,7 @@ describe("useChartData", () => { expect(mockUseAnalyticsQuery).toHaveBeenCalledWith( "test", undefined, - expect.objectContaining({ format: "JSON" }), + expect.objectContaining({ format: "ARROW_STREAM" }), ); }); }); diff --git a/packages/appkit-ui/src/react/hooks/types.ts b/packages/appkit-ui/src/react/hooks/types.ts index 3d539c0f3..26406f140 100644 --- a/packages/appkit-ui/src/react/hooks/types.ts +++ b/packages/appkit-ui/src/react/hooks/types.ts @@ -32,8 +32,10 @@ export interface TypedArrowTable< // ============================================================================ /** Options for configuring an analytics SSE query */ -export interface UseAnalyticsQueryOptions { - /** Response format - "JSON" returns typed arrays, "ARROW" returns TypedArrowTable */ +export interface UseAnalyticsQueryOptions< + F extends AnalyticsFormat = "ARROW_STREAM", +> { + /** Response format - "ARROW_STREAM" (default) uses inline Arrow, 
"JSON" returns typed arrays, "ARROW" uses external links */ format?: F; /** Maximum size of serialized parameters in bytes */ diff --git a/packages/appkit-ui/src/react/hooks/use-analytics-query.ts b/packages/appkit-ui/src/react/hooks/use-analytics-query.ts index 24e03ea3b..7d13648f4 100644 --- a/packages/appkit-ui/src/react/hooks/use-analytics-query.ts +++ b/packages/appkit-ui/src/react/hooks/use-analytics-query.ts @@ -54,13 +54,13 @@ function getArrowStreamUrl(id: string) { export function useAnalyticsQuery< T = unknown, K extends QueryKey = QueryKey, - F extends AnalyticsFormat = "JSON", + F extends AnalyticsFormat = "ARROW_STREAM", >( queryKey: K, parameters?: InferParams | null, options: UseAnalyticsQueryOptions = {} as UseAnalyticsQueryOptions, ): UseAnalyticsQueryResult> { - const format = options?.format ?? "JSON"; + const format = options?.format ?? "ARROW_STREAM"; const maxParametersSize = options?.maxParametersSize ?? 100 * 1024; const autoStart = options?.autoStart ?? true; diff --git a/packages/appkit-ui/src/react/hooks/use-chart-data.ts b/packages/appkit-ui/src/react/hooks/use-chart-data.ts index 8b209faa6..1d1da2dda 100644 --- a/packages/appkit-ui/src/react/hooks/use-chart-data.ts +++ b/packages/appkit-ui/src/react/hooks/use-chart-data.ts @@ -73,10 +73,10 @@ function resolveFormat( return "ARROW"; } - return "JSON"; + return "ARROW_STREAM"; } - return "JSON"; + return "ARROW_STREAM"; } // ============================================================================ diff --git a/packages/appkit/src/connectors/sql-warehouse/defaults.ts b/packages/appkit/src/connectors/sql-warehouse/defaults.ts index 994f11da5..506fa52dc 100644 --- a/packages/appkit/src/connectors/sql-warehouse/defaults.ts +++ b/packages/appkit/src/connectors/sql-warehouse/defaults.ts @@ -12,7 +12,7 @@ interface ExecuteStatementDefaults { export const executeStatementDefaults: ExecuteStatementDefaults = { wait_timeout: "30s", disposition: "INLINE", - format: "JSON_ARRAY", + format: "ARROW_STREAM", on_wait_timeout: "CONTINUE", timeout: 60000, }; diff --git a/packages/appkit/src/plugins/analytics/analytics.ts b/packages/appkit/src/plugins/analytics/analytics.ts index 481ef5e18..b32e5b9f8 100644 --- a/packages/appkit/src/plugins/analytics/analytics.ts +++ b/packages/appkit/src/plugins/analytics/analytics.ts @@ -115,7 +115,8 @@ export class AnalyticsPlugin extends Plugin { res: express.Response, ): Promise { const { query_key } = req.params; - const { parameters, format = "JSON" } = req.body as IAnalyticsQueryRequest; + const { parameters, format = "ARROW_STREAM" } = + req.body as IAnalyticsQueryRequest; // Request-scoped logging with WideEvent tracking logger.debug(req, "Executing query: %s (format=%s)", query_key, format); diff --git a/packages/appkit/src/plugins/analytics/tests/analytics.test.ts b/packages/appkit/src/plugins/analytics/tests/analytics.test.ts index 2051d2f6e..092c92ed1 100644 --- a/packages/appkit/src/plugins/analytics/tests/analytics.test.ts +++ b/packages/appkit/src/plugins/analytics/tests/analytics.test.ts @@ -658,7 +658,7 @@ describe("Analytics Plugin", () => { ); }); - test("/query/:query_key should not pass format parameters when format is JSON (default)", async () => { + test("/query/:query_key should use INLINE + ARROW_STREAM by default when no format specified", async () => { const plugin = new AnalyticsPlugin(config); const { router, getHandler } = createMockRouter(); @@ -683,6 +683,41 @@ describe("Analytics Plugin", () => { await handler(mockReq, mockRes); + 
expect(executeMock).toHaveBeenCalledWith( + expect.anything(), + expect.objectContaining({ + disposition: "INLINE", + format: "ARROW_STREAM", + }), + expect.any(AbortSignal), + ); + }); + + test("/query/:query_key should not pass format parameters when format is explicitly JSON", async () => { + const plugin = new AnalyticsPlugin(config); + const { router, getHandler } = createMockRouter(); + + (plugin as any).app.getAppQuery = vi.fn().mockResolvedValue({ + query: "SELECT * FROM test", + isAsUser: false, + }); + + const executeMock = vi.fn().mockResolvedValue({ + result: { data: [{ id: 1 }] }, + }); + (plugin as any).SQLClient.executeStatement = executeMock; + + plugin.injectRoutes(router); + + const handler = getHandler("POST", "/query/:query_key"); + const mockReq = createMockRequest({ + params: { query_key: "test_query" }, + body: { parameters: {}, format: "JSON" }, + }); + const mockRes = createMockResponse(); + + await handler(mockReq, mockRes); + const callArgs = executeMock.mock.calls[0][1]; expect(callArgs).not.toHaveProperty("disposition"); expect(callArgs).not.toHaveProperty("format"); From 4725c97fcf638e3686d329c71a802b2f37349364 Mon Sep 17 00:00:00 2001 From: James Broadhead Date: Fri, 3 Apr 2026 18:21:36 +0000 Subject: [PATCH 05/17] feat: automatic format fallback for warehouse compatibility MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When using the default ARROW_STREAM format, the analytics plugin now automatically falls back through formats if the warehouse rejects one: ARROW_STREAM → JSON → ARROW. This handles warehouses that only support a subset of format/disposition combinations without requiring users to know their warehouse's capabilities. Explicit format requests (JSON, ARROW) are respected without fallback. Co-authored-by: Isaac Signed-off-by: James Broadhead --- .../appkit/src/plugins/analytics/analytics.ts | 125 +++++++++++--- .../plugins/analytics/tests/analytics.test.ts | 153 ++++++++++++++++++ 2 files changed, 253 insertions(+), 25 deletions(-) diff --git a/packages/appkit/src/plugins/analytics/analytics.ts b/packages/appkit/src/plugins/analytics/analytics.ts index b32e5b9f8..81811e9d9 100644 --- a/packages/appkit/src/plugins/analytics/analytics.ts +++ b/packages/appkit/src/plugins/analytics/analytics.ts @@ -15,6 +15,7 @@ import { queryDefaults } from "./defaults"; import manifest from "./manifest.json"; import { QueryProcessor } from "./query"; import type { + AnalyticsFormat, AnalyticsQueryResponse, IAnalyticsConfig, IAnalyticsQueryRequest, @@ -151,27 +152,6 @@ export class AnalyticsPlugin extends Plugin { const executor = isAsUser ? this.asUser(req) : this; const executorKey = isAsUser ? this.resolveUserId(req) : "global"; - const queryParameters = - format === "ARROW" - ? { - formatParameters: { - disposition: "EXTERNAL_LINKS", - format: "ARROW_STREAM", - }, - type: "arrow", - } - : format === "ARROW_STREAM" - ? 
{ - formatParameters: { - disposition: "INLINE", - format: "ARROW_STREAM", - }, - type: "result", - } - : { - type: "result", - }; - const hashedQuery = this.queryProcessor.hashQuery(query); const defaultConfig: PluginExecuteConfig = { @@ -201,20 +181,115 @@ export class AnalyticsPlugin extends Plugin { parameters, ); - const result = await executor.query( + return this._executeWithFormatFallback( + executor, query, processedParams, - queryParameters.formatParameters, + format, signal, ); - - return { type: queryParameters.type, ...result }; }, streamExecutionSettings, executorKey, ); } + /** Format configurations in fallback order. */ + private static readonly FORMAT_CONFIGS = { + ARROW_STREAM: { + formatParameters: { disposition: "INLINE", format: "ARROW_STREAM" }, + type: "result" as const, + }, + JSON: { + formatParameters: undefined, + type: "result" as const, + }, + ARROW: { + formatParameters: { + disposition: "EXTERNAL_LINKS", + format: "ARROW_STREAM", + }, + type: "arrow" as const, + }, + }; + + /** + * Execute a query with automatic format fallback. + * + * For the default ARROW_STREAM format, tries formats in order until one + * succeeds: ARROW_STREAM → JSON → ARROW. This handles warehouses that + * only support a subset of format/disposition combinations. + * + * Explicit format requests (JSON, ARROW) are not retried. + */ + private async _executeWithFormatFallback( + executor: AnalyticsPlugin, + query: string, + processedParams: + | Record + | undefined, + requestedFormat: AnalyticsFormat, + signal?: AbortSignal, + ): Promise<{ type: string; [key: string]: any }> { + // Explicit format — no fallback. + if (requestedFormat === "JSON" || requestedFormat === "ARROW") { + const config = AnalyticsPlugin.FORMAT_CONFIGS[requestedFormat]; + const result = await executor.query( + query, + processedParams, + config.formatParameters, + signal, + ); + return { type: config.type, ...result }; + } + + // Default (ARROW_STREAM) — try each format in order. + const fallbackOrder: AnalyticsFormat[] = ["ARROW_STREAM", "JSON", "ARROW"]; + + for (let i = 0; i < fallbackOrder.length; i++) { + const fmt = fallbackOrder[i]; + const config = AnalyticsPlugin.FORMAT_CONFIGS[fmt]; + try { + const result = await executor.query( + query, + processedParams, + config.formatParameters, + signal, + ); + if (i > 0) { + logger.info( + "Query succeeded with fallback format %s (preferred %s was rejected)", + fmt, + fallbackOrder[0], + ); + } + return { type: config.type, ...result }; + } catch (err: unknown) { + const msg = err instanceof Error ? err.message : String(err); + const isFormatError = + msg.includes("ARROW_STREAM") || + msg.includes("JSON_ARRAY") || + msg.includes("EXTERNAL_LINKS") || + msg.includes("INVALID_PARAMETER_VALUE") || + msg.includes("NOT_IMPLEMENTED"); + + if (!isFormatError || i === fallbackOrder.length - 1) { + throw err; + } + + logger.warn( + "Format %s rejected by warehouse, falling back to %s: %s", + fmt, + fallbackOrder[i + 1], + msg, + ); + } + } + + // Unreachable — last format in fallbackOrder throws on failure. + throw new Error("All format fallbacks exhausted"); + } + /** * Execute a SQL query using the current execution context. 
* diff --git a/packages/appkit/src/plugins/analytics/tests/analytics.test.ts b/packages/appkit/src/plugins/analytics/tests/analytics.test.ts index 092c92ed1..a57fea02c 100644 --- a/packages/appkit/src/plugins/analytics/tests/analytics.test.ts +++ b/packages/appkit/src/plugins/analytics/tests/analytics.test.ts @@ -723,6 +723,159 @@ describe("Analytics Plugin", () => { expect(callArgs).not.toHaveProperty("format"); }); + test("/query/:query_key should fall back from ARROW_STREAM to JSON when warehouse rejects ARROW_STREAM", async () => { + const plugin = new AnalyticsPlugin(config); + const { router, getHandler } = createMockRouter(); + + (plugin as any).app.getAppQuery = vi.fn().mockResolvedValue({ + query: "SELECT * FROM test", + isAsUser: false, + }); + + const executeMock = vi + .fn() + .mockRejectedValueOnce( + new Error( + "INVALID_PARAMETER_VALUE: Inline disposition only supports JSON_ARRAY format", + ), + ) + .mockResolvedValueOnce({ + result: { data: [{ id: 1 }] }, + }); + (plugin as any).SQLClient.executeStatement = executeMock; + + plugin.injectRoutes(router); + + const handler = getHandler("POST", "/query/:query_key"); + const mockReq = createMockRequest({ + params: { query_key: "test_query" }, + body: { parameters: {} }, + }); + const mockRes = createMockResponse(); + + await handler(mockReq, mockRes); + + // First call: ARROW_STREAM (rejected) + expect(executeMock.mock.calls[0][1]).toMatchObject({ + disposition: "INLINE", + format: "ARROW_STREAM", + }); + // Second call: JSON (no format params, uses defaults) + const secondCallArgs = executeMock.mock.calls[1][1]; + expect(secondCallArgs).not.toHaveProperty("disposition"); + expect(secondCallArgs).not.toHaveProperty("format"); + }); + + test("/query/:query_key should fall back through all formats when each is rejected", async () => { + const plugin = new AnalyticsPlugin(config); + const { router, getHandler } = createMockRouter(); + + (plugin as any).app.getAppQuery = vi.fn().mockResolvedValue({ + query: "SELECT * FROM test", + isAsUser: false, + }); + + const executeMock = vi + .fn() + .mockRejectedValueOnce( + new Error("INVALID_PARAMETER_VALUE: only supports JSON_ARRAY"), + ) + .mockRejectedValueOnce( + new Error("INVALID_PARAMETER_VALUE: only supports ARROW_STREAM"), + ) + .mockResolvedValueOnce({ + result: { data: [{ id: 1 }] }, + }); + (plugin as any).SQLClient.executeStatement = executeMock; + + plugin.injectRoutes(router); + + const handler = getHandler("POST", "/query/:query_key"); + const mockReq = createMockRequest({ + params: { query_key: "test_query" }, + body: { parameters: {} }, + }); + const mockRes = createMockResponse(); + + await handler(mockReq, mockRes); + + expect(executeMock).toHaveBeenCalledTimes(3); + // Third call: ARROW (EXTERNAL_LINKS) + expect(executeMock.mock.calls[2][1]).toMatchObject({ + disposition: "EXTERNAL_LINKS", + format: "ARROW_STREAM", + }); + }); + + test("/query/:query_key should not fall back for non-format errors", async () => { + const plugin = new AnalyticsPlugin(config); + const { router, getHandler } = createMockRouter(); + + (plugin as any).app.getAppQuery = vi.fn().mockResolvedValue({ + query: "SELECT * FROM test", + isAsUser: false, + }); + + const executeMock = vi + .fn() + .mockRejectedValue(new Error("PERMISSION_DENIED: no access")); + (plugin as any).SQLClient.executeStatement = executeMock; + + plugin.injectRoutes(router); + + const handler = getHandler("POST", "/query/:query_key"); + const mockReq = createMockRequest({ + params: { query_key: "test_query" }, + body: { 
parameters: {} }, + }); + const mockRes = createMockResponse(); + + await handler(mockReq, mockRes); + + // All calls use same format (ARROW_STREAM) — no format fallback occurred. + // (executeStream's retry interceptor may retry, but always with the same format.) + for (const call of executeMock.mock.calls) { + expect(call[1]).toMatchObject({ + disposition: "INLINE", + format: "ARROW_STREAM", + }); + } + }); + + test("/query/:query_key should not fall back when format is explicitly JSON", async () => { + const plugin = new AnalyticsPlugin(config); + const { router, getHandler } = createMockRouter(); + + (plugin as any).app.getAppQuery = vi.fn().mockResolvedValue({ + query: "SELECT * FROM test", + isAsUser: false, + }); + + const executeMock = vi + .fn() + .mockRejectedValue( + new Error("INVALID_PARAMETER_VALUE: only supports ARROW_STREAM"), + ); + (plugin as any).SQLClient.executeStatement = executeMock; + + plugin.injectRoutes(router); + + const handler = getHandler("POST", "/query/:query_key"); + const mockReq = createMockRequest({ + params: { query_key: "test_query" }, + body: { parameters: {}, format: "JSON" }, + }); + const mockRes = createMockResponse(); + + await handler(mockReq, mockRes); + + // All calls have no disposition/format — explicit JSON uses defaults, no fallback. + for (const call of executeMock.mock.calls) { + expect(call[1]).not.toHaveProperty("disposition"); + expect(call[1]).not.toHaveProperty("format"); + } + }); + test("should return 404 when query file is not found", async () => { const plugin = new AnalyticsPlugin(config); const { router, getHandler } = createMockRouter(); From a4ad7b0e206193367dea364c2620d3d7c40f27f1 Mon Sep 17 00:00:00 2001 From: James Broadhead Date: Tue, 14 Apr 2026 17:15:53 +0000 Subject: [PATCH 06/17] fix: handle ARROW_STREAM + INLINE data in _transformDataArray Previously, _transformDataArray unconditionally called updateWithArrowStatus for any ARROW_STREAM response, which discards inline data and returns only statement_id + status. This was designed for EXTERNAL_LINKS (where data is fetched separately) but broke INLINE disposition where data is in data_array. Changes: - _transformDataArray now checks for data_array before routing to the EXTERNAL_LINKS path: if data_array is present, it falls through to the standard row-to-object transform. - JSON format now explicitly sends JSON_ARRAY + INLINE rather than relying on connector defaults. This prevents the connector default format from leaking into explicit JSON requests. - Connector defaults reverted to JSON_ARRAY for backward compatibility with classic warehouses (the analytics plugin sets formats explicitly). - Added connector-level tests for _transformDataArray covering ARROW_STREAM + INLINE, ARROW_STREAM + EXTERNAL_LINKS, and JSON_ARRAY paths. 
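
In sketch form, the new routing (condensed from the client.ts diff
below) is:

    if (response.manifest?.format === "ARROW_STREAM") {
      if (!response.result?.data_array) {
        // EXTERNAL_LINKS: data is fetched separately via statement_id.
        return this.updateWithArrowStatus(response);
      }
      // INLINE: data_array present; fall through to the same
      // row-to-object transform used for JSON_ARRAY.
    }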
Co-authored-by: Isaac Signed-off-by: James Broadhead --- .../src/connectors/sql-warehouse/client.ts | 7 +- .../src/connectors/sql-warehouse/defaults.ts | 2 +- .../sql-warehouse/tests/client.test.ts | 153 ++++++++++++++++++ .../appkit/src/plugins/analytics/analytics.ts | 2 +- .../plugins/analytics/tests/analytics.test.ts | 24 +-- 5 files changed, 175 insertions(+), 13 deletions(-) create mode 100644 packages/appkit/src/connectors/sql-warehouse/tests/client.test.ts diff --git a/packages/appkit/src/connectors/sql-warehouse/client.ts b/packages/appkit/src/connectors/sql-warehouse/client.ts index 4ab9344e8..0b962f969 100644 --- a/packages/appkit/src/connectors/sql-warehouse/client.ts +++ b/packages/appkit/src/connectors/sql-warehouse/client.ts @@ -393,7 +393,12 @@ export class SQLWarehouseConnector { private _transformDataArray(response: sql.StatementResponse) { if (response.manifest?.format === "ARROW_STREAM") { - return this.updateWithArrowStatus(response); + // INLINE disposition: data is in data_array, transform like JSON_ARRAY. + // EXTERNAL_LINKS disposition: data fetched separately via statement_id. + if (!response.result?.data_array) { + return this.updateWithArrowStatus(response); + } + // Fall through to the data_array transform below. } if (!response.result?.data_array || !response.manifest?.schema?.columns) { diff --git a/packages/appkit/src/connectors/sql-warehouse/defaults.ts b/packages/appkit/src/connectors/sql-warehouse/defaults.ts index 506fa52dc..994f11da5 100644 --- a/packages/appkit/src/connectors/sql-warehouse/defaults.ts +++ b/packages/appkit/src/connectors/sql-warehouse/defaults.ts @@ -12,7 +12,7 @@ interface ExecuteStatementDefaults { export const executeStatementDefaults: ExecuteStatementDefaults = { wait_timeout: "30s", disposition: "INLINE", - format: "ARROW_STREAM", + format: "JSON_ARRAY", on_wait_timeout: "CONTINUE", timeout: 60000, }; diff --git a/packages/appkit/src/connectors/sql-warehouse/tests/client.test.ts b/packages/appkit/src/connectors/sql-warehouse/tests/client.test.ts new file mode 100644 index 000000000..72fcc1ff3 --- /dev/null +++ b/packages/appkit/src/connectors/sql-warehouse/tests/client.test.ts @@ -0,0 +1,153 @@ +import type { sql } from "@databricks/sdk-experimental"; +import { describe, expect, test, vi } from "vitest"; + +// Mock all transitive dependencies to isolate _transformDataArray logic. 
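+// Telemetry, logging, and the Arrow stream processor are stubbed out so
+// the connector can be constructed standalone, without a workspace client.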
+vi.mock("../../../telemetry", () => { + const mockMeter = { + createCounter: () => ({ add: vi.fn() }), + createHistogram: () => ({ record: vi.fn() }), + }; + return { + TelemetryManager: { + getProvider: () => ({ + startActiveSpan: vi.fn(), + getMeter: () => mockMeter, + }), + }, + SpanKind: { CLIENT: 1 }, + SpanStatusCode: { ERROR: 2 }, + }; +}); +vi.mock("../../../logging/logger", () => ({ + createLogger: () => ({ + info: vi.fn(), + debug: vi.fn(), + warn: vi.fn(), + error: vi.fn(), + event: () => null, + }), +})); +vi.mock("../../../stream/arrow-stream-processor", () => ({ + ArrowStreamProcessor: vi.fn(), +})); + +import { SQLWarehouseConnector } from "../client"; + +function createConnector() { + return new SQLWarehouseConnector({ timeout: 30000 }); +} + +describe("SQLWarehouseConnector._transformDataArray", () => { + test("transforms ARROW_STREAM + INLINE data_array into named objects", () => { + const connector = createConnector(); + const response = { + statement_id: "stmt-1", + status: { state: "SUCCEEDED" }, + manifest: { + format: "ARROW_STREAM", + schema: { + columns: [ + { name: "id", type_name: "INT" }, + { name: "value", type_name: "STRING" }, + ], + }, + }, + result: { + data_array: [ + ["1", "hello"], + ["2", "world"], + ], + }, + } as unknown as sql.StatementResponse; + + const result = (connector as any)._transformDataArray(response); + expect(result.result.data).toEqual([ + { id: "1", value: "hello" }, + { id: "2", value: "world" }, + ]); + expect(result.result.data_array).toBeUndefined(); + }); + + test("returns statement_id for ARROW_STREAM + EXTERNAL_LINKS (no data_array)", () => { + const connector = createConnector(); + const response = { + statement_id: "stmt-1", + status: { state: "SUCCEEDED" }, + manifest: { format: "ARROW_STREAM" }, + result: { + external_links: [ + { external_link: "https://storage.example.com/chunk0" }, + ], + }, + } as unknown as sql.StatementResponse; + + const result = (connector as any)._transformDataArray(response); + expect(result.result.statement_id).toBe("stmt-1"); + expect(result.result.data).toBeUndefined(); + }); + + test("transforms JSON_ARRAY data_array into named objects", () => { + const connector = createConnector(); + const response = { + statement_id: "stmt-1", + status: { state: "SUCCEEDED" }, + manifest: { + format: "JSON_ARRAY", + schema: { + columns: [ + { name: "name", type_name: "STRING" }, + { name: "count", type_name: "INT" }, + ], + }, + }, + result: { + data_array: [ + ["Alice", "10"], + ["Bob", "20"], + ], + }, + } as unknown as sql.StatementResponse; + + const result = (connector as any)._transformDataArray(response); + expect(result.result.data).toEqual([ + { name: "Alice", count: "10" }, + { name: "Bob", count: "20" }, + ]); + }); + + test("parses JSON strings in STRING columns for ARROW_STREAM + INLINE", () => { + const connector = createConnector(); + const response = { + statement_id: "stmt-1", + status: { state: "SUCCEEDED" }, + manifest: { + format: "ARROW_STREAM", + schema: { + columns: [ + { name: "id", type_name: "INT" }, + { name: "metadata", type_name: "STRING" }, + ], + }, + }, + result: { + data_array: [["1", '{"key":"value"}']], + }, + } as unknown as sql.StatementResponse; + + const result = (connector as any)._transformDataArray(response); + expect(result.result.data[0].metadata).toEqual({ key: "value" }); + }); + + test("returns response unchanged when no data_array or schema", () => { + const connector = createConnector(); + const response = { + statement_id: "stmt-1", + status: { state: 
"SUCCEEDED" }, + manifest: { format: "JSON_ARRAY" }, + result: {}, + } as unknown as sql.StatementResponse; + + const result = (connector as any)._transformDataArray(response); + expect(result).toBe(response); + }); +}); diff --git a/packages/appkit/src/plugins/analytics/analytics.ts b/packages/appkit/src/plugins/analytics/analytics.ts index 81811e9d9..d73c5bbe6 100644 --- a/packages/appkit/src/plugins/analytics/analytics.ts +++ b/packages/appkit/src/plugins/analytics/analytics.ts @@ -201,7 +201,7 @@ export class AnalyticsPlugin extends Plugin { type: "result" as const, }, JSON: { - formatParameters: undefined, + formatParameters: { disposition: "INLINE", format: "JSON_ARRAY" }, type: "result" as const, }, ARROW: { diff --git a/packages/appkit/src/plugins/analytics/tests/analytics.test.ts b/packages/appkit/src/plugins/analytics/tests/analytics.test.ts index a57fea02c..f39b07887 100644 --- a/packages/appkit/src/plugins/analytics/tests/analytics.test.ts +++ b/packages/appkit/src/plugins/analytics/tests/analytics.test.ts @@ -718,9 +718,10 @@ describe("Analytics Plugin", () => { await handler(mockReq, mockRes); - const callArgs = executeMock.mock.calls[0][1]; - expect(callArgs).not.toHaveProperty("disposition"); - expect(callArgs).not.toHaveProperty("format"); + expect(executeMock.mock.calls[0][1]).toMatchObject({ + disposition: "INLINE", + format: "JSON_ARRAY", + }); }); test("/query/:query_key should fall back from ARROW_STREAM to JSON when warehouse rejects ARROW_STREAM", async () => { @@ -760,10 +761,11 @@ describe("Analytics Plugin", () => { disposition: "INLINE", format: "ARROW_STREAM", }); - // Second call: JSON (no format params, uses defaults) - const secondCallArgs = executeMock.mock.calls[1][1]; - expect(secondCallArgs).not.toHaveProperty("disposition"); - expect(secondCallArgs).not.toHaveProperty("format"); + // Second call: JSON (explicit JSON_ARRAY + INLINE) + expect(executeMock.mock.calls[1][1]).toMatchObject({ + disposition: "INLINE", + format: "JSON_ARRAY", + }); }); test("/query/:query_key should fall back through all formats when each is rejected", async () => { @@ -869,10 +871,12 @@ describe("Analytics Plugin", () => { await handler(mockReq, mockRes); - // All calls have no disposition/format — explicit JSON uses defaults, no fallback. + // All calls use JSON_ARRAY + INLINE — explicit JSON, no fallback. for (const call of executeMock.mock.calls) { - expect(call[1]).not.toHaveProperty("disposition"); - expect(call[1]).not.toHaveProperty("format"); + expect(call[1]).toMatchObject({ + disposition: "INLINE", + format: "JSON_ARRAY", + }); } }); From 1e17f5f7bbe03ce544f87d97beec90a078997557 Mon Sep 17 00:00:00 2001 From: James Broadhead Date: Tue, 14 Apr 2026 19:56:47 +0000 Subject: [PATCH 07/17] feat: decode inline Arrow IPC attachments from serverless warehouses Some serverless warehouses return ARROW_STREAM + INLINE results as base64 Arrow IPC in `result.attachment` rather than `result.data_array`. This adds server-side decoding using apache-arrow's tableFromIPC to convert the attachment into row objects, producing the same response shape as JSON_ARRAY regardless of warehouse backend. This abstracts a Databricks internal implementation detail (different warehouses returning different response formats) so app developers get a consistent `type: "result"` response with named row objects. 
Changes: - Add apache-arrow@21.1.0 as a server dependency (already used client-side) - _transformDataArray detects `attachment` field and decodes via tableFromIPC - Connector tests use real base64 Arrow IPC captured from a live serverless warehouse, covering: classic JSON_ARRAY, classic EXTERNAL_LINKS, serverless INLINE attachment, data_array fallback, and edge cases Co-authored-by: Isaac Signed-off-by: James Broadhead --- packages/appkit/package.json | 1 + .../src/connectors/sql-warehouse/client.ts | 39 +- .../sql-warehouse/tests/client.test.ts | 333 ++++++++++++------ pnpm-lock.yaml | 6 +- 4 files changed, 274 insertions(+), 105 deletions(-) diff --git a/packages/appkit/package.json b/packages/appkit/package.json index 3b57014c0..04232f88b 100644 --- a/packages/appkit/package.json +++ b/packages/appkit/package.json @@ -69,6 +69,7 @@ "@opentelemetry/sdk-trace-base": "2.6.0", "@opentelemetry/semantic-conventions": "1.38.0", "@types/semver": "7.7.1", + "apache-arrow": "21.1.0", "dotenv": "16.6.1", "express": "4.22.0", "obug": "2.1.1", diff --git a/packages/appkit/src/connectors/sql-warehouse/client.ts b/packages/appkit/src/connectors/sql-warehouse/client.ts index 0b962f969..f844693f6 100644 --- a/packages/appkit/src/connectors/sql-warehouse/client.ts +++ b/packages/appkit/src/connectors/sql-warehouse/client.ts @@ -3,6 +3,7 @@ import { type sql, type WorkspaceClient, } from "@databricks/sdk-experimental"; +import { tableFromIPC } from "apache-arrow"; import type { TelemetryOptions } from "shared"; import { AppKitError, @@ -393,12 +394,20 @@ export class SQLWarehouseConnector { private _transformDataArray(response: sql.StatementResponse) { if (response.manifest?.format === "ARROW_STREAM") { - // INLINE disposition: data is in data_array, transform like JSON_ARRAY. - // EXTERNAL_LINKS disposition: data fetched separately via statement_id. - if (!response.result?.data_array) { + const result = response.result as any; + + // Inline Arrow: some warehouses return base64 Arrow IPC in `attachment`. + if (result?.attachment) { + return this._transformArrowAttachment(response, result.attachment); + } + + // Inline data_array: fall through to the row transform below. + if (result?.data_array) { + // Fall through. + } else { + // External links: data fetched separately via statement_id. return this.updateWithArrowStatus(response); } - // Fall through to the data_array transform below. } if (!response.result?.data_array || !response.manifest?.schema?.columns) { @@ -444,6 +453,28 @@ export class SQLWarehouseConnector { }; } + /** + * Decode a base64 Arrow IPC attachment into row objects. + * Some serverless warehouses return inline results as Arrow IPC in + * `result.attachment` rather than `result.data_array`. 
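+   * The IPC stream carries its own schema, so rows are reconstructed
+   * without consulting manifest.schema.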
+ */ + private _transformArrowAttachment( + response: sql.StatementResponse, + attachment: string, + ) { + const buf = Buffer.from(attachment, "base64"); + const table = tableFromIPC(buf); + const data = table.toArray().map((row) => row.toJSON()); + const { attachment: _att, ...restResult } = response.result as any; + return { + ...response, + result: { + ...restResult, + data, + }, + }; + } + private updateWithArrowStatus(response: sql.StatementResponse): { result: { statement_id: string; status: sql.StatementStatus }; } { diff --git a/packages/appkit/src/connectors/sql-warehouse/tests/client.test.ts b/packages/appkit/src/connectors/sql-warehouse/tests/client.test.ts index 72fcc1ff3..73bc8cda3 100644 --- a/packages/appkit/src/connectors/sql-warehouse/tests/client.test.ts +++ b/packages/appkit/src/connectors/sql-warehouse/tests/client.test.ts @@ -1,7 +1,6 @@ import type { sql } from "@databricks/sdk-experimental"; import { describe, expect, test, vi } from "vitest"; -// Mock all transitive dependencies to isolate _transformDataArray logic. vi.mock("../../../telemetry", () => { const mockMeter = { createCounter: () => ({ add: vi.fn() }), @@ -37,117 +36,251 @@ function createConnector() { return new SQLWarehouseConnector({ timeout: 30000 }); } +// Real base64 Arrow IPC from a serverless warehouse returning +// `SELECT 1 AS test_col, 2 AS test_col2` with INLINE + ARROW_STREAM. +// Contains schema (two INT columns) + one record batch with values [1, 2]. +const REAL_ARROW_ATTACHMENT = + "/////7gAAAAQAAAAAAAKAAwACgAJAAQACgAAABAAAAAAAQQACAAIAAAABAAIAAAABAAAAAIAAABMAAAABAAAAMz///8QAAAAGAAAAAAAAQIUAAAAvP///yAAAAAAAAABAAAAAAkAAAB0ZXN0X2NvbDIAAAAQABQAEAAOAA8ABAAAAAgAEAAAABgAAAAgAAAAAAABAhwAAAAIAAwABAALAAgAAAAgAAAAAAAAAQAAAAAIAAAAdGVzdF9jb2wAAAAA/////7gAAAAQAAAADAAaABgAFwAEAAgADAAAACAAAAAAAQAAAAAAAAAAAAAAAAADBAAKABgADAAIAAQACgAAADwAAAAQAAAAAQAAAAAAAAAAAAAAAgAAAAEAAAAAAAAAAAAAAAAAAAABAAAAAAAAAAAAAAAAAAAAAAAAAAQAAAAAAAAAAAAAAAEAAAAAAAAAQAAAAAAAAAAEAAAAAAAAAIAAAAAAAAAAAQAAAAAAAADAAAAAAAAAAAQAAAAAAAAA/wAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAEAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAD/AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAgAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAP////8AAAAA"; + describe("SQLWarehouseConnector._transformDataArray", () => { - test("transforms ARROW_STREAM + INLINE data_array into named objects", () => { - const connector = createConnector(); - const response = { - statement_id: "stmt-1", - status: { state: "SUCCEEDED" }, - manifest: { - format: "ARROW_STREAM", - schema: { - columns: [ - { name: "id", type_name: "INT" }, - { name: "value", type_name: "STRING" }, - ], + describe("classic warehouse (JSON_ARRAY + INLINE)", () => { + test("transforms data_array rows into named objects", () => { + const connector = createConnector(); + // Real response shape from classic warehouse: INLINE + JSON_ARRAY + const response = { + statement_id: "stmt-1", + status: { state: "SUCCEEDED" }, + manifest: { + format: "JSON_ARRAY", + schema: { + column_count: 2, + columns: [ + { + name: "test_col", + type_text: "INT", + type_name: "INT", + position: 0, + }, + { + name: "test_col2", + type_text: "INT", + type_name: "INT", + position: 1, + }, + ], + }, + total_row_count: 1, + truncated: false, }, - }, - result: { - data_array: [ - ["1", "hello"], - ["2", "world"], - ], - }, - } as unknown as sql.StatementResponse; - - const result = 
(connector as any)._transformDataArray(response); - expect(result.result.data).toEqual([ - { id: "1", value: "hello" }, - { id: "2", value: "world" }, - ]); - expect(result.result.data_array).toBeUndefined(); - }); + result: { + data_array: [["1", "2"]], + }, + } as unknown as sql.StatementResponse; - test("returns statement_id for ARROW_STREAM + EXTERNAL_LINKS (no data_array)", () => { - const connector = createConnector(); - const response = { - statement_id: "stmt-1", - status: { state: "SUCCEEDED" }, - manifest: { format: "ARROW_STREAM" }, - result: { - external_links: [ - { external_link: "https://storage.example.com/chunk0" }, - ], - }, - } as unknown as sql.StatementResponse; - - const result = (connector as any)._transformDataArray(response); - expect(result.result.statement_id).toBe("stmt-1"); - expect(result.result.data).toBeUndefined(); + const result = (connector as any)._transformDataArray(response); + expect(result.result.data).toEqual([{ test_col: "1", test_col2: "2" }]); + expect(result.result.data_array).toBeUndefined(); + }); + + test("parses JSON strings in STRING columns", () => { + const connector = createConnector(); + const response = { + statement_id: "stmt-1", + status: { state: "SUCCEEDED" }, + manifest: { + format: "JSON_ARRAY", + schema: { + columns: [ + { name: "id", type_name: "INT" }, + { name: "metadata", type_name: "STRING" }, + ], + }, + }, + result: { + data_array: [["1", '{"key":"value"}']], + }, + } as unknown as sql.StatementResponse; + + const result = (connector as any)._transformDataArray(response); + expect(result.result.data[0].metadata).toEqual({ key: "value" }); + }); }); - test("transforms JSON_ARRAY data_array into named objects", () => { - const connector = createConnector(); - const response = { - statement_id: "stmt-1", - status: { state: "SUCCEEDED" }, - manifest: { - format: "JSON_ARRAY", - schema: { - columns: [ - { name: "name", type_name: "STRING" }, - { name: "count", type_name: "INT" }, + describe("classic warehouse (EXTERNAL_LINKS + ARROW_STREAM)", () => { + test("returns statement_id for external links fetch", () => { + const connector = createConnector(); + // Real response shape from classic warehouse: EXTERNAL_LINKS + ARROW_STREAM + const response = { + statement_id: "stmt-1", + status: { state: "SUCCEEDED" }, + manifest: { + format: "ARROW_STREAM", + schema: { + columns: [ + { name: "test_col", type_name: "INT" }, + { name: "test_col2", type_name: "INT" }, + ], + }, + }, + result: { + external_links: [ + { + external_link: "https://storage.example.com/chunk0", + expiration: "2026-04-15T00:00:00Z", + }, ], }, - }, - result: { - data_array: [ - ["Alice", "10"], - ["Bob", "20"], - ], - }, - } as unknown as sql.StatementResponse; - - const result = (connector as any)._transformDataArray(response); - expect(result.result.data).toEqual([ - { name: "Alice", count: "10" }, - { name: "Bob", count: "20" }, - ]); + } as unknown as sql.StatementResponse; + + const result = (connector as any)._transformDataArray(response); + expect(result.result.statement_id).toBe("stmt-1"); + expect(result.result.data).toBeUndefined(); + }); + }); + + describe("serverless warehouse (INLINE + ARROW_STREAM with attachment)", () => { + test("decodes base64 Arrow IPC attachment into row objects", () => { + const connector = createConnector(); + // Real response shape from serverless warehouse: INLINE + ARROW_STREAM + // Data arrives in result.attachment as base64-encoded Arrow IPC, not data_array. 
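+      // REAL_ARROW_ATTACHMENT decodes to exactly one row:
+      // { test_col: 1, test_col2: 2 }.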
+ const response = { + statement_id: "00000001-test-stmt", + status: { state: "SUCCEEDED" }, + manifest: { + format: "ARROW_STREAM", + schema: { + column_count: 2, + columns: [ + { + name: "test_col", + type_text: "INT", + type_name: "INT", + position: 0, + }, + { + name: "test_col2", + type_text: "INT", + type_name: "INT", + position: 1, + }, + ], + total_chunk_count: 1, + chunks: [{ chunk_index: 0, row_offset: 0, row_count: 1 }], + total_row_count: 1, + }, + truncated: false, + }, + result: { + chunk_index: 0, + row_offset: 0, + row_count: 1, + attachment: REAL_ARROW_ATTACHMENT, + }, + } as unknown as sql.StatementResponse; + + const result = (connector as any)._transformDataArray(response); + expect(result.result.data).toEqual([{ test_col: 1, test_col2: 2 }]); + expect(result.result.attachment).toBeUndefined(); + // Preserves other result fields + expect(result.result.row_count).toBe(1); + }); + + test("preserves manifest and status alongside decoded data", () => { + const connector = createConnector(); + const response = { + statement_id: "00000001-test-stmt", + status: { state: "SUCCEEDED" }, + manifest: { + format: "ARROW_STREAM", + schema: { + columns: [ + { name: "test_col", type_name: "INT" }, + { name: "test_col2", type_name: "INT" }, + ], + }, + }, + result: { + chunk_index: 0, + row_count: 1, + attachment: REAL_ARROW_ATTACHMENT, + }, + } as unknown as sql.StatementResponse; + + const result = (connector as any)._transformDataArray(response); + // Manifest and statement_id are preserved + expect(result.manifest.format).toBe("ARROW_STREAM"); + expect(result.statement_id).toBe("00000001-test-stmt"); + }); }); - test("parses JSON strings in STRING columns for ARROW_STREAM + INLINE", () => { - const connector = createConnector(); - const response = { - statement_id: "stmt-1", - status: { state: "SUCCEEDED" }, - manifest: { - format: "ARROW_STREAM", - schema: { - columns: [ - { name: "id", type_name: "INT" }, - { name: "metadata", type_name: "STRING" }, + describe("ARROW_STREAM with data_array (hypothetical inline variant)", () => { + test("transforms data_array like JSON_ARRAY path", () => { + const connector = createConnector(); + const response = { + statement_id: "stmt-1", + status: { state: "SUCCEEDED" }, + manifest: { + format: "ARROW_STREAM", + schema: { + columns: [ + { name: "id", type_name: "INT" }, + { name: "value", type_name: "STRING" }, + ], + }, + }, + result: { + data_array: [ + ["1", "hello"], + ["2", "world"], ], }, - }, - result: { - data_array: [["1", '{"key":"value"}']], - }, - } as unknown as sql.StatementResponse; - - const result = (connector as any)._transformDataArray(response); - expect(result.result.data[0].metadata).toEqual({ key: "value" }); + } as unknown as sql.StatementResponse; + + const result = (connector as any)._transformDataArray(response); + expect(result.result.data).toEqual([ + { id: "1", value: "hello" }, + { id: "2", value: "world" }, + ]); + }); }); - test("returns response unchanged when no data_array or schema", () => { - const connector = createConnector(); - const response = { - statement_id: "stmt-1", - status: { state: "SUCCEEDED" }, - manifest: { format: "JSON_ARRAY" }, - result: {}, - } as unknown as sql.StatementResponse; - - const result = (connector as any)._transformDataArray(response); - expect(result).toBe(response); + describe("edge cases", () => { + test("returns response unchanged when no data_array, attachment, or schema", () => { + const connector = createConnector(); + const response = { + statement_id: "stmt-1", + 
status: { state: "SUCCEEDED" }, + manifest: { format: "JSON_ARRAY" }, + result: {}, + } as unknown as sql.StatementResponse; + + const result = (connector as any)._transformDataArray(response); + expect(result).toBe(response); + }); + + test("attachment takes priority over data_array when both present", () => { + const connector = createConnector(); + const response = { + statement_id: "stmt-1", + status: { state: "SUCCEEDED" }, + manifest: { + format: "ARROW_STREAM", + schema: { + columns: [ + { name: "test_col", type_name: "INT" }, + { name: "test_col2", type_name: "INT" }, + ], + }, + }, + result: { + attachment: REAL_ARROW_ATTACHMENT, + data_array: [["999", "999"]], + }, + } as unknown as sql.StatementResponse; + + const result = (connector as any)._transformDataArray(response); + // Should use attachment (Arrow IPC), not data_array + expect(result.result.data).toEqual([{ test_col: 1, test_col2: 2 }]); + }); }); }); diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 9ca11b818..46096f433 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -299,6 +299,9 @@ importers: '@types/semver': specifier: 7.7.1 version: 7.7.1 + apache-arrow: + specifier: 21.1.0 + version: 21.1.0 dotenv: specifier: 16.6.1 version: 16.6.1 @@ -5539,7 +5542,7 @@ packages: basic-ftp@5.0.5: resolution: {integrity: sha512-4Bcg1P8xhUuqcii/S0Z9wiHIrQVPMermM1any+MX5GeGD7faD3/msQUDGLol9wOcz4/jbg/WJnGqoJF6LiBdtg==} engines: {node: '>=10.0.0'} - deprecated: Security vulnerability fixed in 5.2.0, please upgrade + deprecated: Security vulnerability fixed in 5.2.1, please upgrade batch@0.6.1: resolution: {integrity: sha512-x+VAiMRL6UPkx+kudNvxTl6hB2XNNCG2r+7wixVfIYwu/2HKRXimwQyaumLjMveWvT2Hkd/cAJw+QBMfJ/EKVw==} @@ -6653,6 +6656,7 @@ packages: dottie@2.0.6: resolution: {integrity: sha512-iGCHkfUc5kFekGiqhe8B/mdaurD+lakO9txNnTvKtA6PISrw86LgqHvRzWYPyoE2Ph5aMIrCw9/uko6XHTKCwA==} + deprecated: Package no longer supported. Contact Support at https://www.npmjs.com/support for more info. drizzle-orm@0.45.1: resolution: {integrity: sha512-Te0FOdKIistGNPMq2jscdqngBRfBpC8uMFVwqjf6gtTVJHIQ/dosgV/CLBU2N4ZJBsXL5savCba9b0YJskKdcA==} From 055cd412b14f09db303d81fb68de8288d2d33834 Mon Sep 17 00:00:00 2001 From: James Broadhead Date: Wed, 15 Apr 2026 16:41:11 +0000 Subject: [PATCH 08/17] test: add 147 tests for service-context, stream-registry, genie connector, files plugin New test files covering major coverage gaps: - context/tests/service-context.test.ts (35 tests, 7% -> 100%) - stream/tests/stream-registry.test.ts (34 tests, 32% -> 100%) - connectors/genie/tests/client.test.ts (28 tests, 61% -> 97%) - plugins/files/tests/upload-and-write.test.ts (50 tests, 69% -> 89%) Total: 1566 -> 1713 tests, all passing. 
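
Assuming the repo's standard vitest setup, each suite can also be run on
its own from packages/appkit, e.g.:

    pnpm vitest run src/connectors/genie/tests/client.test.ts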
Co-authored-by: Isaac --- .../src/connectors/genie/tests/client.test.ts | 786 +++++++++++ .../src/context/tests/service-context.test.ts | 457 ++++++ .../files/tests/upload-and-write.test.ts | 1245 +++++++++++++++++ .../src/stream/tests/stream-registry.test.ts | 582 ++++++++ 4 files changed, 3070 insertions(+) create mode 100644 packages/appkit/src/connectors/genie/tests/client.test.ts create mode 100644 packages/appkit/src/context/tests/service-context.test.ts create mode 100644 packages/appkit/src/plugins/files/tests/upload-and-write.test.ts create mode 100644 packages/appkit/src/stream/tests/stream-registry.test.ts diff --git a/packages/appkit/src/connectors/genie/tests/client.test.ts b/packages/appkit/src/connectors/genie/tests/client.test.ts new file mode 100644 index 000000000..62fc3578d --- /dev/null +++ b/packages/appkit/src/connectors/genie/tests/client.test.ts @@ -0,0 +1,786 @@ +import type { GenieMessage } from "@databricks/sdk-experimental/dist/apis/dashboards"; +import { beforeEach, describe, expect, test, vi } from "vitest"; +import { GenieConnector } from "../client"; +import type { GenieStreamEvent } from "../types"; + +// --------------------------------------------------------------------------- +// Helpers +// --------------------------------------------------------------------------- + +async function collect( + gen: AsyncGenerator, +): Promise { + const events: GenieStreamEvent[] = []; + for await (const event of gen) { + events.push(event); + } + return events; +} + +function makeGenieMessage(overrides: Partial = {}): GenieMessage { + return { + message_id: "msg-1", + conversation_id: "conv-1", + space_id: "space-1", + status: "COMPLETED", + content: "Hello from Genie", + attachments: [], + ...overrides, + } as GenieMessage; +} + +function makeGenieMessageWithQuery( + overrides: Partial = {}, +): GenieMessage { + return makeGenieMessage({ + attachments: [ + { + attachment_id: "att-1", + query: { + title: "Sales Query", + description: "Total sales", + query: "SELECT sum(amount) FROM sales", + statement_id: "stmt-1", + }, + }, + ], + ...overrides, + }); +} + +/** Creates a mock WorkspaceClient with genie methods stubbed. */ +function createMockWorkspaceClient() { + return { + genie: { + startConversation: vi.fn(), + createMessage: vi.fn(), + getMessage: vi.fn(), + listConversationMessages: vi.fn(), + getMessageAttachmentQueryResult: vi.fn(), + }, + } as any; +} + +/** + * Builds a mock waiter whose `.wait()` invokes `onProgress` for each + * progress value, then resolves with the final result. 
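+ * The returned shape mirrors what these tests resolve from
+ * startConversation/createMessage: wait(), message_id, conversation_id.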
+ */ +function createMockWaiter(opts: { + progressValues?: Partial[]; + result: GenieMessage; +}) { + return { + wait: vi.fn().mockImplementation(async (options: any = {}) => { + if (opts.progressValues) { + for (const value of opts.progressValues) { + if (options.onProgress) { + await options.onProgress(value); + } + } + } + return opts.result; + }), + message_id: opts.result.message_id, + conversation_id: opts.result.conversation_id, + }; +} + +// --------------------------------------------------------------------------- +// Tests +// --------------------------------------------------------------------------- + +describe("GenieConnector", () => { + let connector: GenieConnector; + let ws: ReturnType; + + beforeEach(() => { + connector = new GenieConnector({ timeout: 0 }); + ws = createMockWorkspaceClient(); + }); + + // ----------------------------------------------------------------------- + // streamSendMessage + // ----------------------------------------------------------------------- + + describe("streamSendMessage", () => { + test("yields message_start, status updates, then message_result", async () => { + const completedMsg = makeGenieMessage(); + const waiter = createMockWaiter({ + progressValues: [ + { status: "EXECUTING_QUERY" }, + { status: "COMPLETED" }, + ], + result: completedMsg, + }); + ws.genie.startConversation.mockResolvedValue(waiter); + + const events = await collect( + connector.streamSendMessage( + ws, + "space-1", + "What are sales?", + undefined, + ), + ); + + expect(events[0]).toEqual({ + type: "message_start", + conversationId: "conv-1", + messageId: "msg-1", + spaceId: "space-1", + }); + + const statusEvents = events.filter((e) => e.type === "status"); + expect(statusEvents).toEqual([ + { type: "status", status: "EXECUTING_QUERY" }, + { type: "status", status: "COMPLETED" }, + ]); + + const msgResult = events.find((e) => e.type === "message_result"); + expect(msgResult).toBeDefined(); + expect((msgResult as any).message.messageId).toBe("msg-1"); + }); + + test("new conversation calls startConversation", async () => { + const completedMsg = makeGenieMessage(); + const waiter = createMockWaiter({ result: completedMsg }); + ws.genie.startConversation.mockResolvedValue(waiter); + + await collect( + connector.streamSendMessage(ws, "space-1", "hello", undefined), + ); + + expect(ws.genie.startConversation).toHaveBeenCalledWith({ + space_id: "space-1", + content: "hello", + }); + expect(ws.genie.createMessage).not.toHaveBeenCalled(); + }); + + test("existing conversation calls createMessage", async () => { + const completedMsg = makeGenieMessage(); + const waiter = createMockWaiter({ result: completedMsg }); + ws.genie.createMessage.mockResolvedValue(waiter); + + await collect( + connector.streamSendMessage(ws, "space-1", "hello", "conv-existing"), + ); + + expect(ws.genie.createMessage).toHaveBeenCalledWith({ + space_id: "space-1", + conversation_id: "conv-existing", + content: "hello", + }); + expect(ws.genie.startConversation).not.toHaveBeenCalled(); + }); + + test("emits query_result for attachments with statementIds", async () => { + const completedMsg = makeGenieMessageWithQuery(); + const waiter = createMockWaiter({ result: completedMsg }); + ws.genie.startConversation.mockResolvedValue(waiter); + + const statementResponse = { + manifest: { + schema: { columns: [{ name: "total", type_name: "DOUBLE" }] }, + }, + result: { data_array: [["1234.56"]] }, + }; + ws.genie.getMessageAttachmentQueryResult.mockResolvedValue({ + statement_response: statementResponse, + 
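+        // (the connector is expected to surface this statement_response as
+        // the query_result event's `data`, which the assertion below checks)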
}); + + const events = await collect( + connector.streamSendMessage(ws, "space-1", "query", undefined), + ); + + const queryResult = events.find((e) => e.type === "query_result"); + expect(queryResult).toEqual({ + type: "query_result", + attachmentId: "att-1", + statementId: "stmt-1", + data: statementResponse, + }); + }); + + test("yields error event on SDK failure", async () => { + ws.genie.startConversation.mockRejectedValue( + new Error("Network timeout"), + ); + + const events = await collect( + connector.streamSendMessage(ws, "space-1", "hello", undefined), + ); + + expect(events).toEqual([{ type: "error", error: "Network timeout" }]); + }); + + test("classifies RESOURCE_DOES_NOT_EXIST as access denied", async () => { + ws.genie.startConversation.mockRejectedValue( + new Error("RESOURCE_DOES_NOT_EXIST: space not found"), + ); + + const events = await collect( + connector.streamSendMessage(ws, "space-1", "hello", undefined), + ); + + expect(events).toEqual([ + { + type: "error", + error: "You don't have access to this Genie Space.", + }, + ]); + }); + + test("emits error event when query result fetch fails", async () => { + const completedMsg = makeGenieMessageWithQuery(); + const waiter = createMockWaiter({ result: completedMsg }); + ws.genie.startConversation.mockResolvedValue(waiter); + ws.genie.getMessageAttachmentQueryResult.mockRejectedValue( + new Error("statement expired"), + ); + + const events = await collect( + connector.streamSendMessage(ws, "space-1", "query", undefined), + ); + + const errorEvent = events.find((e) => e.type === "error"); + expect(errorEvent).toEqual({ + type: "error", + error: "Failed to fetch query result for attachment att-1", + }); + }); + }); + + // ----------------------------------------------------------------------- + // streamConversation + // ----------------------------------------------------------------------- + + describe("streamConversation", () => { + test("yields message_result for each message, then history_info", async () => { + ws.genie.listConversationMessages.mockResolvedValue({ + messages: [ + makeGenieMessage({ message_id: "m1", content: "first" }), + makeGenieMessage({ message_id: "m2", content: "second" }), + ], + next_page_token: null, + }); + + const events = await collect( + connector.streamConversation(ws, "space-1", "conv-1", { + includeQueryResults: false, + }), + ); + + const messageResults = events.filter((e) => e.type === "message_result"); + expect(messageResults).toHaveLength(2); + + const historyInfo = events.find((e) => e.type === "history_info"); + expect(historyInfo).toEqual({ + type: "history_info", + conversationId: "conv-1", + spaceId: "space-1", + nextPageToken: null, + loadedCount: 2, + }); + }); + + test("fetches query results in parallel when includeQueryResults=true", async () => { + ws.genie.listConversationMessages.mockResolvedValue({ + messages: [ + makeGenieMessageWithQuery({ + message_id: "m1", + attachments: [ + { + attachment_id: "att-a", + query: { + title: "Q1", + query: "SELECT 1", + statement_id: "stmt-a", + }, + }, + { + attachment_id: "att-b", + query: { + title: "Q2", + query: "SELECT 2", + statement_id: "stmt-b", + }, + }, + ], + }), + ], + next_page_token: null, + }); + + const stmtResponse = { + manifest: { schema: { columns: [] } }, + result: { data_array: [] }, + }; + ws.genie.getMessageAttachmentQueryResult.mockResolvedValue({ + statement_response: stmtResponse, + }); + + const events = await collect( + connector.streamConversation(ws, "space-1", "conv-1", { + includeQueryResults: true, 
+ }), + ); + + const queryResults = events.filter((e) => e.type === "query_result"); + expect(queryResults).toHaveLength(2); + expect(ws.genie.getMessageAttachmentQueryResult).toHaveBeenCalledTimes(2); + }); + + test("skips query results when includeQueryResults=false", async () => { + ws.genie.listConversationMessages.mockResolvedValue({ + messages: [makeGenieMessageWithQuery()], + next_page_token: null, + }); + + const events = await collect( + connector.streamConversation(ws, "space-1", "conv-1", { + includeQueryResults: false, + }), + ); + + expect(events.filter((e) => e.type === "query_result")).toHaveLength(0); + expect(ws.genie.getMessageAttachmentQueryResult).not.toHaveBeenCalled(); + }); + + test("handles partial query result failures via Promise.allSettled", async () => { + ws.genie.listConversationMessages.mockResolvedValue({ + messages: [ + makeGenieMessage({ + message_id: "m1", + attachments: [ + { + attachment_id: "att-ok", + query: { + title: "OK", + query: "SELECT 1", + statement_id: "stmt-ok", + }, + }, + { + attachment_id: "att-fail", + query: { + title: "Fail", + query: "SELECT 2", + statement_id: "stmt-fail", + }, + }, + ], + }), + ], + next_page_token: null, + }); + + const stmtResponse = { + manifest: { schema: { columns: [] } }, + result: { data_array: [] }, + }; + + ws.genie.getMessageAttachmentQueryResult + .mockResolvedValueOnce({ statement_response: stmtResponse }) + .mockRejectedValueOnce(new Error("statement expired")); + + const events = await collect( + connector.streamConversation(ws, "space-1", "conv-1", { + includeQueryResults: true, + }), + ); + + const queryResults = events.filter((e) => e.type === "query_result"); + expect(queryResults).toHaveLength(1); + + const errors = events.filter((e) => e.type === "error"); + expect(errors).toHaveLength(1); + expect((errors[0] as any).error).toBe("statement expired"); + }); + + test("yields error when listConversationMessages fails", async () => { + ws.genie.listConversationMessages.mockRejectedValue( + new Error("RESOURCE_DOES_NOT_EXIST: conv not found"), + ); + + const events = await collect( + connector.streamConversation(ws, "space-1", "conv-1"), + ); + + expect(events).toEqual([ + { + type: "error", + error: "You don't have access to this Genie Space.", + }, + ]); + }); + }); + + // ----------------------------------------------------------------------- + // streamGetMessage + // ----------------------------------------------------------------------- + + describe("streamGetMessage", () => { + test("polls until COMPLETED, yields status + message_result", async () => { + ws.genie.getMessage + .mockResolvedValueOnce(makeGenieMessage({ status: "EXECUTING_QUERY" })) + .mockResolvedValueOnce(makeGenieMessage({ status: "COMPLETED" })); + + const events = await collect( + connector.streamGetMessage(ws, "space-1", "conv-1", "msg-1", { + pollInterval: 0, + }), + ); + + expect(events[0]).toEqual({ + type: "status", + status: "EXECUTING_QUERY", + }); + expect(events[1]).toEqual({ type: "status", status: "COMPLETED" }); + expect(events[2]).toMatchObject({ type: "message_result" }); + expect(ws.genie.getMessage).toHaveBeenCalledTimes(2); + }); + + test("polls until FAILED, yields status + message_result", async () => { + ws.genie.getMessage + .mockResolvedValueOnce(makeGenieMessage({ status: "EXECUTING_QUERY" })) + .mockResolvedValueOnce( + makeGenieMessage({ + status: "FAILED", + error: { error: "query timed out" }, + }), + ); + + const events = await collect( + connector.streamGetMessage(ws, "space-1", "conv-1", "msg-1", 
{ + pollInterval: 0, + }), + ); + + const statusEvents = events.filter((e) => e.type === "status"); + expect(statusEvents).toEqual([ + { type: "status", status: "EXECUTING_QUERY" }, + { type: "status", status: "FAILED" }, + ]); + + const msgResult = events.find((e) => e.type === "message_result") as any; + expect(msgResult.message.status).toBe("FAILED"); + expect(msgResult.message.error).toBe("query timed out"); + }); + + test("respects abort signal", async () => { + const controller = new AbortController(); + + ws.genie.getMessage.mockResolvedValue( + makeGenieMessage({ status: "EXECUTING_QUERY" }), + ); + + const gen = connector.streamGetMessage(ws, "space-1", "conv-1", "msg-1", { + pollInterval: 50, + signal: controller.signal, + }); + + const events: GenieStreamEvent[] = []; + // Collect the first status event, then abort + for await (const event of gen) { + events.push(event); + if (events.length === 1) { + controller.abort(); + } + } + + // Should have stopped after abort - at most 2 events + // (the status from poll 1, and possibly status from poll 2 that was already in-flight) + expect(events.length).toBeLessThanOrEqual(2); + expect(events[0]).toEqual({ + type: "status", + status: "EXECUTING_QUERY", + }); + }); + + test("yields error when getMessage throws", async () => { + ws.genie.getMessage.mockRejectedValue(new Error("service unavailable")); + + const events = await collect( + connector.streamGetMessage(ws, "space-1", "conv-1", "msg-1", { + pollInterval: 0, + }), + ); + + expect(events).toEqual([{ type: "error", error: "service unavailable" }]); + }); + + test("does not duplicate status events for same status", async () => { + ws.genie.getMessage + .mockResolvedValueOnce(makeGenieMessage({ status: "EXECUTING_QUERY" })) + .mockResolvedValueOnce(makeGenieMessage({ status: "EXECUTING_QUERY" })) + .mockResolvedValueOnce(makeGenieMessage({ status: "COMPLETED" })); + + const events = await collect( + connector.streamGetMessage(ws, "space-1", "conv-1", "msg-1", { + pollInterval: 0, + }), + ); + + const statusEvents = events.filter((e) => e.type === "status"); + expect(statusEvents).toEqual([ + { type: "status", status: "EXECUTING_QUERY" }, + { type: "status", status: "COMPLETED" }, + ]); + }); + }); + + // ----------------------------------------------------------------------- + // sendMessage + // ----------------------------------------------------------------------- + + describe("sendMessage", () => { + test("returns completed message response", async () => { + const completedMsg = makeGenieMessage({ + message_id: "msg-42", + conversation_id: "conv-new", + }); + const waiter = createMockWaiter({ result: completedMsg }); + ws.genie.startConversation.mockResolvedValue(waiter); + + const result = await connector.sendMessage( + ws, + "space-1", + "What are sales?", + undefined, + ); + + expect(result.messageId).toBe("msg-42"); + expect(result.conversationId).toBe("conv-new"); + expect(result.status).toBe("COMPLETED"); + }); + }); + + // ----------------------------------------------------------------------- + // getConversation + // ----------------------------------------------------------------------- + + describe("getConversation", () => { + test("paginates through all pages", async () => { + // listConversationMessages reverses the SDK response, so mock data + // is ordered newest-first (as the SDK returns) and results are + // oldest-first after reversal. 
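+      // e.g. the pages below arrive as [m2, m1] then [m3]; after reversal the
+      // combined history is [m1, m2, m3], which is what this test asserts.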
+ ws.genie.listConversationMessages + .mockResolvedValueOnce({ + messages: [ + makeGenieMessage({ message_id: "m2" }), + makeGenieMessage({ message_id: "m1" }), + ], + next_page_token: "page2", + }) + .mockResolvedValueOnce({ + messages: [makeGenieMessage({ message_id: "m3" })], + next_page_token: null, + }); + + const result = await connector.getConversation(ws, "space-1", "conv-1"); + + expect(result.messages).toHaveLength(3); + expect(result.messages.map((m) => m.messageId)).toEqual([ + "m1", + "m2", + "m3", + ]); + expect(ws.genie.listConversationMessages).toHaveBeenCalledTimes(2); + }); + + test("respects maxMessages limit", async () => { + const smallConnector = new GenieConnector({ + timeout: 0, + maxMessages: 2, + }); + + ws.genie.listConversationMessages.mockResolvedValueOnce({ + messages: [ + makeGenieMessage({ message_id: "m1" }), + makeGenieMessage({ message_id: "m2" }), + makeGenieMessage({ message_id: "m3" }), + ], + next_page_token: "page2", + }); + + const result = await smallConnector.getConversation( + ws, + "space-1", + "conv-1", + ); + + // Should be sliced to maxMessages + expect(result.messages).toHaveLength(2); + // Should NOT fetch a second page since length already >= maxMessages + expect(ws.genie.listConversationMessages).toHaveBeenCalledTimes(1); + }); + }); + + // ----------------------------------------------------------------------- + // mapAttachments (tested indirectly via toMessageResponse) + // ----------------------------------------------------------------------- + + describe("mapAttachments", () => { + test("handles query attachments", async () => { + const msg = makeGenieMessageWithQuery(); + const waiter = createMockWaiter({ result: msg }); + ws.genie.startConversation.mockResolvedValue(waiter); + + // We drive through streamSendMessage to exercise mapAttachments + ws.genie.getMessageAttachmentQueryResult.mockResolvedValue({ + statement_response: { + manifest: { schema: { columns: [] } }, + result: { data_array: [] }, + }, + }); + + const events = await collect( + connector.streamSendMessage(ws, "space-1", "q", undefined), + ); + + const msgResult = events.find((e) => e.type === "message_result") as any; + expect(msgResult.message.attachments[0]).toEqual({ + attachmentId: "att-1", + query: { + title: "Sales Query", + description: "Total sales", + query: "SELECT sum(amount) FROM sales", + statementId: "stmt-1", + }, + text: undefined, + suggestedQuestions: undefined, + }); + }); + + test("handles text attachments", async () => { + const msg = makeGenieMessage({ + attachments: [ + { + attachment_id: "att-text", + text: { content: "Here is the explanation" }, + }, + ], + }); + const waiter = createMockWaiter({ result: msg }); + ws.genie.startConversation.mockResolvedValue(waiter); + + const events = await collect( + connector.streamSendMessage(ws, "space-1", "q", undefined), + ); + + const msgResult = events.find((e) => e.type === "message_result") as any; + expect(msgResult.message.attachments[0]).toEqual({ + attachmentId: "att-text", + query: undefined, + text: { content: "Here is the explanation" }, + suggestedQuestions: undefined, + }); + }); + + test("handles suggestedQuestions attachments", async () => { + const msg = makeGenieMessage({ + attachments: [ + { + attachment_id: "att-sq", + suggested_questions: { + questions: ["What is X?", "Show me Y"], + }, + }, + ], + }); + const waiter = createMockWaiter({ result: msg }); + ws.genie.startConversation.mockResolvedValue(waiter); + + const events = await collect( + connector.streamSendMessage(ws, 
"space-1", "q", undefined), + ); + + const msgResult = events.find((e) => e.type === "message_result") as any; + expect(msgResult.message.attachments[0]).toEqual({ + attachmentId: "att-sq", + query: undefined, + text: undefined, + suggestedQuestions: ["What is X?", "Show me Y"], + }); + }); + + test("returns empty array when message has no attachments", async () => { + const msg = makeGenieMessage({ attachments: undefined }); + const waiter = createMockWaiter({ result: msg }); + ws.genie.startConversation.mockResolvedValue(waiter); + + const events = await collect( + connector.streamSendMessage(ws, "space-1", "q", undefined), + ); + + const msgResult = events.find((e) => e.type === "message_result") as any; + expect(msgResult.message.attachments).toEqual([]); + }); + }); + + // ----------------------------------------------------------------------- + // classifyGenieError (tested indirectly via error events) + // ----------------------------------------------------------------------- + + describe("classifyGenieError", () => { + test("maps RESOURCE_DOES_NOT_EXIST to space access denied", async () => { + ws.genie.startConversation.mockRejectedValue( + new Error("RESOURCE_DOES_NOT_EXIST: space xyz"), + ); + + const events = await collect( + connector.streamSendMessage(ws, "space-1", "hi", undefined), + ); + + expect(events[0]).toEqual({ + type: "error", + error: "You don't have access to this Genie Space.", + }); + }); + + test("maps failed-to-reach-COMPLETED + FAILED to table permissions", async () => { + ws.genie.startConversation.mockRejectedValue( + new Error("failed to reach COMPLETED state, got FAILED"), + ); + + const events = await collect( + connector.streamSendMessage(ws, "space-1", "hi", undefined), + ); + + expect(events[0]).toEqual({ + type: "error", + error: + "You may not have access to the data tables. 
Please verify your table permissions.", + }); + }); + + test("passes through unknown error messages", async () => { + ws.genie.startConversation.mockRejectedValue( + new Error("something unexpected"), + ); + + const events = await collect( + connector.streamSendMessage(ws, "space-1", "hi", undefined), + ); + + expect(events[0]).toEqual({ + type: "error", + error: "something unexpected", + }); + }); + + test("handles non-Error throwable", async () => { + ws.genie.startConversation.mockRejectedValue("string error"); + + const events = await collect( + connector.streamSendMessage(ws, "space-1", "hi", undefined), + ); + + expect(events[0]).toEqual({ + type: "error", + error: "string error", + }); + }); + }); +}); diff --git a/packages/appkit/src/context/tests/service-context.test.ts b/packages/appkit/src/context/tests/service-context.test.ts new file mode 100644 index 000000000..e8610da14 --- /dev/null +++ b/packages/appkit/src/context/tests/service-context.test.ts @@ -0,0 +1,457 @@ +import { setupDatabricksEnv } from "@tools/test-helpers"; +import { afterEach, beforeEach, describe, expect, test, vi } from "vitest"; +import { + AuthenticationError, + ConfigurationError, + InitializationError, +} from "../../errors"; +import { ServiceContext } from "../service-context"; + +// ── Mock @databricks/sdk-experimental ────────────────────────────── + +const { mockMe, mockApiRequest, MockWorkspaceClient } = vi.hoisted(() => { + const mockMe = vi.fn(); + const mockApiRequest = vi.fn(); + + const MockWorkspaceClient = vi.fn().mockImplementation(() => ({ + currentUser: { me: mockMe }, + apiClient: { request: mockApiRequest }, + })); + + return { mockMe, mockApiRequest, MockWorkspaceClient }; +}); + +vi.mock("@databricks/sdk-experimental", () => ({ + WorkspaceClient: MockWorkspaceClient, +})); + +// ── Helpers ──────────────────────────────────────────────────────── + +function setupDefaultMocks() { + mockMe.mockResolvedValue({ id: "service-user-123" }); + mockApiRequest.mockResolvedValue({ "x-databricks-org-id": "ws-456" }); +} + +// ── Tests ────────────────────────────────────────────────────────── + +describe("ServiceContext", () => { + const originalEnv = { ...process.env }; + + beforeEach(() => { + vi.clearAllMocks(); + ServiceContext.reset(); + setupDatabricksEnv(); + setupDefaultMocks(); + }); + + afterEach(() => { + process.env = { ...originalEnv }; + ServiceContext.reset(); + }); + + // ── initialize() ─────────────────────────────────────────────── + + describe("initialize()", () => { + test("should initialize with a pre-configured client", async () => { + const client = new MockWorkspaceClient() as any; + + const state = await ServiceContext.initialize({}, client); + + expect(state.client).toBe(client); + expect(state.serviceUserId).toBe("service-user-123"); + expect(await state.workspaceId).toBe("ws-456"); + }); + + test("should create a WorkspaceClient when none is provided", async () => { + await ServiceContext.initialize(); + + // The mock constructor is called once internally + expect(MockWorkspaceClient).toHaveBeenCalled(); + }); + + test("should resolve warehouseId when options.warehouseId is true", async () => { + process.env.DATABRICKS_WAREHOUSE_ID = "wh-789"; + + const state = await ServiceContext.initialize({ warehouseId: true }); + + expect(state.warehouseId).toBeDefined(); + expect(await state.warehouseId).toBe("wh-789"); + }); + + test("should not set warehouseId when options.warehouseId is false", async () => { + const state = await ServiceContext.initialize({ warehouseId: false 
}); + + expect(state.warehouseId).toBeUndefined(); + }); + + test("should not set warehouseId when options are omitted", async () => { + const state = await ServiceContext.initialize(); + + expect(state.warehouseId).toBeUndefined(); + }); + + test("should throw when currentUser.me() returns no id", async () => { + mockMe.mockResolvedValue({}); + + await expect(ServiceContext.initialize()).rejects.toThrow( + ConfigurationError, + ); + }); + + test("should be idempotent - calling twice returns same instance", async () => { + const state1 = await ServiceContext.initialize(); + const state2 = await ServiceContext.initialize(); + + expect(state1).toBe(state2); + }); + + test("concurrent calls return the same promise", async () => { + const p1 = ServiceContext.initialize(); + const p2 = ServiceContext.initialize(); + + const [state1, state2] = await Promise.all([p1, p2]); + + expect(state1).toBe(state2); + // currentUser.me should only be called once regardless of concurrent calls + expect(mockMe).toHaveBeenCalledTimes(1); + }); + }); + + // ── get() ────────────────────────────────────────────────────── + + describe("get()", () => { + test("should throw InitializationError when not initialized", () => { + expect(() => ServiceContext.get()).toThrow(InitializationError); + expect(() => ServiceContext.get()).toThrow( + /ServiceContext not initialized/, + ); + }); + + test("should return state after initialization", async () => { + const state = await ServiceContext.initialize(); + const retrieved = ServiceContext.get(); + + expect(retrieved).toBe(state); + }); + }); + + // ── isInitialized() ──────────────────────────────────────────── + + describe("isInitialized()", () => { + test("should return false before initialization", () => { + expect(ServiceContext.isInitialized()).toBe(false); + }); + + test("should return true after initialization", async () => { + await ServiceContext.initialize(); + + expect(ServiceContext.isInitialized()).toBe(true); + }); + + test("should return false after reset()", async () => { + await ServiceContext.initialize(); + ServiceContext.reset(); + + expect(ServiceContext.isInitialized()).toBe(false); + }); + }); + + // ── createUserContext() ──────────────────────────────────────── + + describe("createUserContext()", () => { + beforeEach(async () => { + await ServiceContext.initialize({ warehouseId: true }); + }); + + test("should create a user context with correct properties", () => { + const userCtx = ServiceContext.createUserContext( + "user-token-abc", + "user-42", + "Alice", + ); + + expect(userCtx.userId).toBe("user-42"); + expect(userCtx.userName).toBe("Alice"); + expect(userCtx.isUserContext).toBe(true); + expect(userCtx.client).toBeDefined(); + }); + + test("should share warehouseId and workspaceId from service context", async () => { + process.env.DATABRICKS_WAREHOUSE_ID = "wh-shared"; + + // Re-initialize with the new env + ServiceContext.reset(); + mockApiRequest.mockResolvedValue({ "x-databricks-org-id": "ws-shared" }); + await ServiceContext.initialize({ warehouseId: true }); + + const userCtx = ServiceContext.createUserContext("user-token", "user-1"); + + const serviceCtx = ServiceContext.get(); + expect(userCtx.warehouseId).toBe(serviceCtx.warehouseId); + expect(userCtx.workspaceId).toBe(serviceCtx.workspaceId); + }); + + test("should create user client with PAT authType", () => { + ServiceContext.createUserContext("user-token", "user-1"); + + // The last call to MockWorkspaceClient should be for the user client + const lastCall = + 
MockWorkspaceClient.mock.calls[ + MockWorkspaceClient.mock.calls.length - 1 + ]; + expect(lastCall[0]).toMatchObject({ + token: "user-token", + host: process.env.DATABRICKS_HOST, + authType: "pat", + }); + }); + + test("should handle missing userName gracefully", () => { + const userCtx = ServiceContext.createUserContext("user-token", "user-1"); + + expect(userCtx.userName).toBeUndefined(); + }); + + test("should throw AuthenticationError on missing token", () => { + expect(() => ServiceContext.createUserContext("", "user-1")).toThrow( + AuthenticationError, + ); + }); + + test("should throw ConfigurationError when DATABRICKS_HOST is not set", () => { + delete process.env.DATABRICKS_HOST; + + expect(() => ServiceContext.createUserContext("token", "user-1")).toThrow( + ConfigurationError, + ); + }); + + test("should throw InitializationError when service context is not initialized", () => { + ServiceContext.reset(); + + expect(() => ServiceContext.createUserContext("token", "user-1")).toThrow( + InitializationError, + ); + }); + }); + + // ── reset() ──────────────────────────────────────────────────── + + describe("reset()", () => { + test("should clear the singleton state", async () => { + await ServiceContext.initialize(); + expect(ServiceContext.isInitialized()).toBe(true); + + ServiceContext.reset(); + + expect(ServiceContext.isInitialized()).toBe(false); + expect(() => ServiceContext.get()).toThrow(InitializationError); + }); + + test("should allow re-initialization after reset", async () => { + await ServiceContext.initialize(); + ServiceContext.reset(); + + mockMe.mockResolvedValue({ id: "new-service-user" }); + const state = await ServiceContext.initialize(); + + expect(state.serviceUserId).toBe("new-service-user"); + }); + }); + + // ── getWorkspaceId() (private, tested via initialize) ───────── + + describe("getWorkspaceId()", () => { + test("should use DATABRICKS_WORKSPACE_ID env var when set", async () => { + process.env.DATABRICKS_WORKSPACE_ID = "env-ws-123"; + + const state = await ServiceContext.initialize(); + + expect(await state.workspaceId).toBe("env-ws-123"); + // Should not call the SCIM API when env var is set + expect(mockApiRequest).not.toHaveBeenCalledWith( + expect.objectContaining({ path: "/api/2.0/preview/scim/v2/Me" }), + ); + }); + + test("should call SCIM API when env var is not set", async () => { + delete process.env.DATABRICKS_WORKSPACE_ID; + mockApiRequest.mockResolvedValue({ + "x-databricks-org-id": "scim-ws-789", + }); + + const state = await ServiceContext.initialize(); + + expect(await state.workspaceId).toBe("scim-ws-789"); + expect(mockApiRequest).toHaveBeenCalledWith( + expect.objectContaining({ + path: "/api/2.0/preview/scim/v2/Me", + method: "GET", + responseHeaders: ["x-databricks-org-id"], + }), + ); + }); + + test("should throw when SCIM API returns no workspace ID", async () => { + delete process.env.DATABRICKS_WORKSPACE_ID; + mockApiRequest.mockResolvedValue({}); + + const state = await ServiceContext.initialize(); + + await expect(state.workspaceId).rejects.toThrow(ConfigurationError); + }); + }); + + // ── getWarehouseId() (private, tested via initialize) ───────── + + describe("getWarehouseId()", () => { + test("should use DATABRICKS_WAREHOUSE_ID env var when set", async () => { + process.env.DATABRICKS_WAREHOUSE_ID = "env-wh-abc"; + + const state = await ServiceContext.initialize({ warehouseId: true }); + + expect(await state.warehouseId).toBe("env-wh-abc"); + }); + + test("should auto-discover warehouse in development mode", async () => 
{ + delete process.env.DATABRICKS_WAREHOUSE_ID; + process.env.NODE_ENV = "development"; + + mockApiRequest.mockImplementation(({ path }: { path: string }) => { + if (path === "/api/2.0/sql/warehouses") { + return Promise.resolve({ + warehouses: [ + { id: "wh-stopped", state: "STOPPED" }, + { id: "wh-running", state: "RUNNING" }, + { id: "wh-starting", state: "STARTING" }, + ], + }); + } + // SCIM response for workspaceId + return Promise.resolve({ "x-databricks-org-id": "ws-dev" }); + }); + + const state = await ServiceContext.initialize({ warehouseId: true }); + + // Should pick RUNNING warehouse (highest priority) + expect(await state.warehouseId).toBe("wh-running"); + }); + + test("should sort warehouses by state priority in dev mode", async () => { + delete process.env.DATABRICKS_WAREHOUSE_ID; + process.env.NODE_ENV = "development"; + + mockApiRequest.mockImplementation(({ path }: { path: string }) => { + if (path === "/api/2.0/sql/warehouses") { + return Promise.resolve({ + warehouses: [ + { id: "wh-stopping", state: "STOPPING" }, + { id: "wh-starting", state: "STARTING" }, + { id: "wh-stopped", state: "STOPPED" }, + ], + }); + } + return Promise.resolve({ "x-databricks-org-id": "ws-dev" }); + }); + + const state = await ServiceContext.initialize({ warehouseId: true }); + + // STOPPED (priority 1) < STARTING (priority 2) < STOPPING (priority 3) + expect(await state.warehouseId).toBe("wh-stopped"); + }); + + test("should throw in dev mode when no warehouses are available", async () => { + delete process.env.DATABRICKS_WAREHOUSE_ID; + process.env.NODE_ENV = "development"; + + mockApiRequest.mockImplementation(({ path }: { path: string }) => { + if (path === "/api/2.0/sql/warehouses") { + return Promise.resolve({ warehouses: [] }); + } + return Promise.resolve({ "x-databricks-org-id": "ws-dev" }); + }); + + const state = await ServiceContext.initialize({ warehouseId: true }); + + await expect(state.warehouseId).rejects.toThrow(ConfigurationError); + }); + + test("should throw in dev mode when all warehouses are deleted", async () => { + delete process.env.DATABRICKS_WAREHOUSE_ID; + process.env.NODE_ENV = "development"; + + mockApiRequest.mockImplementation(({ path }: { path: string }) => { + if (path === "/api/2.0/sql/warehouses") { + return Promise.resolve({ + warehouses: [ + { id: "wh-deleted", state: "DELETED" }, + { id: "wh-deleting", state: "DELETING" }, + ], + }); + } + return Promise.resolve({ "x-databricks-org-id": "ws-dev" }); + }); + + const state = await ServiceContext.initialize({ warehouseId: true }); + + await expect(state.warehouseId).rejects.toThrow(ConfigurationError); + }); + + test("should throw in dev mode when best warehouse has no id", async () => { + delete process.env.DATABRICKS_WAREHOUSE_ID; + process.env.NODE_ENV = "development"; + + mockApiRequest.mockImplementation(({ path }: { path: string }) => { + if (path === "/api/2.0/sql/warehouses") { + return Promise.resolve({ + warehouses: [{ state: "RUNNING" }], + }); + } + return Promise.resolve({ "x-databricks-org-id": "ws-dev" }); + }); + + const state = await ServiceContext.initialize({ warehouseId: true }); + + await expect(state.warehouseId).rejects.toThrow(ConfigurationError); + }); + + test("should throw in production when DATABRICKS_WAREHOUSE_ID is not set", async () => { + delete process.env.DATABRICKS_WAREHOUSE_ID; + process.env.NODE_ENV = "production"; + + const state = await ServiceContext.initialize({ warehouseId: true }); + + await expect(state.warehouseId).rejects.toThrow(ConfigurationError); + await 
expect(state.warehouseId).rejects.toThrow( + /DATABRICKS_WAREHOUSE_ID/, + ); + }); + }); + + // ── getClientOptions() ───────────────────────────────────────── + + describe("getClientOptions()", () => { + test("should return product name and version", () => { + const options = ServiceContext.getClientOptions(); + + expect(options.product).toBe("@databricks/appkit"); + expect(options.productVersion).toBeDefined(); + }); + + test("should include dev mode user agent extra in development", () => { + process.env.NODE_ENV = "development"; + + const options = ServiceContext.getClientOptions(); + + expect(options.userAgentExtra).toEqual({ mode: "dev" }); + }); + + test("should not include dev mode user agent extra in production", () => { + process.env.NODE_ENV = "production"; + + const options = ServiceContext.getClientOptions(); + + expect(options.userAgentExtra).toBeUndefined(); + }); + }); +}); diff --git a/packages/appkit/src/plugins/files/tests/upload-and-write.test.ts b/packages/appkit/src/plugins/files/tests/upload-and-write.test.ts new file mode 100644 index 000000000..8da3f021c --- /dev/null +++ b/packages/appkit/src/plugins/files/tests/upload-and-write.test.ts @@ -0,0 +1,1245 @@ +import { Readable } from "node:stream"; +import { mockServiceContext, setupDatabricksEnv } from "@tools/test-helpers"; +import { afterEach, beforeEach, describe, expect, test, vi } from "vitest"; +import { ServiceContext } from "../../../context/service-context"; +import { AuthenticationError } from "../../../errors"; +import { FilesPlugin } from "../plugin"; + +const { mockClient, MockApiError, mockCacheInstance } = vi.hoisted(() => { + const mockFilesApi = { + listDirectoryContents: vi.fn(), + download: vi.fn(), + getMetadata: vi.fn(), + upload: vi.fn(), + createDirectory: vi.fn(), + delete: vi.fn(), + }; + + const mockClient = { + files: mockFilesApi, + config: { + host: "https://test.databricks.com", + authenticate: vi.fn(), + }, + }; + + class MockApiError extends Error { + statusCode: number; + constructor(message: string, statusCode: number) { + super(message); + this.name = "ApiError"; + this.statusCode = statusCode; + } + } + + const mockCacheInstance = { + get: vi.fn(), + set: vi.fn(), + delete: vi.fn(), + getOrExecute: vi.fn(async (_key: unknown[], fn: () => Promise) => + fn(), + ), + generateKey: vi.fn((...args: unknown[]) => JSON.stringify(args)), + }; + + return { mockFilesApi, mockClient, MockApiError, mockCacheInstance }; +}); + +vi.mock("@databricks/sdk-experimental", () => ({ + WorkspaceClient: vi.fn(() => mockClient), + ApiError: MockApiError, +})); + +vi.mock("../../../context", async (importOriginal) => { + const actual = await importOriginal(); + return { + ...actual, + getWorkspaceClient: vi.fn(() => mockClient), + isInUserContext: vi.fn(() => true), + }; +}); + +vi.mock("../../../cache", () => ({ + CacheManager: { + getInstanceSync: vi.fn(() => mockCacheInstance), + }, +})); + +const VOLUMES_CONFIG = { + volumes: { + uploads: { maxUploadSize: 100_000_000 }, + exports: {}, + }, +}; + +/** + * Helper to get a route handler from the plugin. Registers routes on a mock + * router and returns the handler matching the given method + path suffix. 
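+ *
+ * Example (as used throughout these tests):
+ *   const handler = getRouteHandler(plugin, "get", "/raw");
+ *   await handler(mockReq("uploads", { query: { path } }), mockRes());
+ */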
+ */ +function getRouteHandler( + plugin: FilesPlugin, + method: "get" | "post" | "delete", + pathSuffix: string, +) { + const mockRouter = { + use: vi.fn(), + get: vi.fn(), + post: vi.fn(), + put: vi.fn(), + delete: vi.fn(), + patch: vi.fn(), + } as any; + + plugin.injectRoutes(mockRouter); + + const call = mockRouter[method].mock.calls.find( + (c: unknown[]) => + typeof c[0] === "string" && (c[0] as string).endsWith(pathSuffix), + ); + if (!call) throw new Error(`No route found for ${method} ...${pathSuffix}`); + return call[call.length - 1] as (req: any, res: any) => Promise; +} + +/** + * Creates a mock Express response with all methods needed by the route handlers. + */ +function mockRes() { + const res: any = { + headersSent: false, + }; + res.status = vi.fn().mockReturnValue(res); + res.json = vi.fn().mockReturnValue(res); + res.type = vi.fn().mockReturnValue(res); + res.send = vi.fn().mockReturnValue(res); + res.setHeader = vi.fn().mockReturnValue(res); + res.write = vi.fn().mockReturnValue(true); + res.destroy = vi.fn(); + res.end = vi.fn(); + res.on = vi.fn().mockReturnValue(res); + res.once = vi.fn().mockReturnValue(res); + res.emit = vi.fn().mockReturnValue(true); + res.removeListener = vi.fn().mockReturnValue(res); + res.pipe = vi.fn().mockReturnValue(res); + return res; +} + +/** + * Creates a mock Express request with the auth headers needed by the plugin's + * `asUser()` proxy. + */ +function mockReq(volumeKey: string, overrides: Record = {}): any { + const headers: Record = { + "x-forwarded-access-token": "test-token", + "x-forwarded-user": "test-user", + ...(overrides.headers ?? {}), + }; + + const req: any = { + params: { volumeKey }, + query: {}, + ...overrides, + headers, + header: (name: string) => headers[name.toLowerCase()], + }; + + return req; +} + +/** + * Creates a mock Express request that behaves as a Node Readable stream, + * suitable for the upload handler which calls Readable.toWeb(req). + */ +function mockUploadReq( + volumeKey: string, + bodyChunks: Buffer[], + overrides: Record = {}, +): any { + const headers: Record = { + "x-forwarded-access-token": "test-token", + "x-forwarded-user": "test-user", + ...(overrides.headers ?? {}), + }; + + // Create a real Node Readable so Readable.toWeb() works + let chunkIndex = 0; + const stream = new Readable({ + read() { + if (chunkIndex < bodyChunks.length) { + this.push(bodyChunks[chunkIndex++]); + } else { + this.push(null); + } + }, + }); + + // Patch stream with Express request properties + (stream as any).params = { volumeKey }; + (stream as any).query = overrides.query ?? {}; + (stream as any).headers = headers; + (stream as any).header = (name: string) => headers[name.toLowerCase()]; + (stream as any).body = overrides.body; + + return stream; +} + +describe("FilesPlugin - Upload, Write, and Error Handling", () => { + let serviceContextMock: Awaited>; + + beforeEach(async () => { + vi.clearAllMocks(); + setupDatabricksEnv(); + ServiceContext.reset(); + process.env.DATABRICKS_VOLUME_UPLOADS = "/Volumes/catalog/schema/uploads"; + process.env.DATABRICKS_VOLUME_EXPORTS = "/Volumes/catalog/schema/exports"; + serviceContextMock = await mockServiceContext(); + }); + + afterEach(() => { + serviceContextMock?.restore(); + delete process.env.DATABRICKS_VOLUME_UPLOADS; + delete process.env.DATABRICKS_VOLUME_EXPORTS; + }); + + // ────────────────────────────────────────────────────────────────────── + // 1. 
_handleApiError: AuthenticationError -> 401, ApiError variants, + // non-ApiError -> 500 + // ────────────────────────────────────────────────────────────────────── + describe("_handleApiError", () => { + test("AuthenticationError returns 401 with error message", async () => { + const plugin = new FilesPlugin(VOLUMES_CONFIG); + const res = mockRes(); + + (plugin as any)._handleApiError( + res, + new AuthenticationError("Missing token"), + "fallback msg", + ); + + expect(res.status).toHaveBeenCalledWith(401); + expect(res.json).toHaveBeenCalledWith({ + error: "Missing token", + plugin: "files", + }); + }); + + test("ApiError with 4xx status preserves status and message", () => { + const plugin = new FilesPlugin(VOLUMES_CONFIG); + const res = mockRes(); + + (plugin as any)._handleApiError( + res, + new MockApiError("Forbidden", 403), + "fallback msg", + ); + + expect(res.status).toHaveBeenCalledWith(403); + expect(res.json).toHaveBeenCalledWith({ + error: "Forbidden", + statusCode: 403, + plugin: "files", + }); + }); + + test("ApiError with 404 preserves status", () => { + const plugin = new FilesPlugin(VOLUMES_CONFIG); + const res = mockRes(); + + (plugin as any)._handleApiError( + res, + new MockApiError("Not found", 404), + "fallback msg", + ); + + expect(res.status).toHaveBeenCalledWith(404); + expect(res.json).toHaveBeenCalledWith({ + error: "Not found", + statusCode: 404, + plugin: "files", + }); + }); + + test("ApiError with 409 Conflict preserves status", () => { + const plugin = new FilesPlugin(VOLUMES_CONFIG); + const res = mockRes(); + + (plugin as any)._handleApiError( + res, + new MockApiError("Conflict", 409), + "fallback msg", + ); + + expect(res.status).toHaveBeenCalledWith(409); + expect(res.json).toHaveBeenCalledWith({ + error: "Conflict", + statusCode: 409, + plugin: "files", + }); + }); + + test("ApiError with 5xx returns 500 with fallback message", () => { + const plugin = new FilesPlugin(VOLUMES_CONFIG); + const res = mockRes(); + + (plugin as any)._handleApiError( + res, + new MockApiError("Bad Gateway", 502), + "Operation failed", + ); + + expect(res.status).toHaveBeenCalledWith(500); + expect(res.json).toHaveBeenCalledWith({ + error: "Operation failed", + plugin: "files", + }); + }); + + test("ApiError with statusCode 500 returns 500 with fallback", () => { + const plugin = new FilesPlugin(VOLUMES_CONFIG); + const res = mockRes(); + + (plugin as any)._handleApiError( + res, + new MockApiError("Internal error", 500), + "Fallback", + ); + + expect(res.status).toHaveBeenCalledWith(500); + expect(res.json).toHaveBeenCalledWith({ + error: "Fallback", + plugin: "files", + }); + }); + + test("non-ApiError falls back to 500 with fallback message", () => { + const plugin = new FilesPlugin(VOLUMES_CONFIG); + const res = mockRes(); + + (plugin as any)._handleApiError(res, new Error("unknown"), "Fallback"); + + expect(res.status).toHaveBeenCalledWith(500); + expect(res.json).toHaveBeenCalledWith({ + error: "Fallback", + plugin: "files", + }); + }); + + test("non-ApiError exception returns 500 with fallback message", async () => { + const plugin = new FilesPlugin(VOLUMES_CONFIG); + const res = mockRes(); + + (plugin as any)._handleApiError( + res, + new TypeError("Cannot read properties of undefined"), + "Internal Server Error", + ); + + expect(res.status).toHaveBeenCalledWith(500); + expect(res.json).toHaveBeenCalledWith({ + error: "Internal Server Error", + plugin: "files", + }); + }); + + test("AuthenticationError via route (missing token in production)", async () => { + const 
plugin = new FilesPlugin(VOLUMES_CONFIG); + const handler = getRouteHandler(plugin, "get", "/list"); + const res = mockRes(); + + const originalEnv = process.env.NODE_ENV; + process.env.NODE_ENV = "production"; + + try { + await handler( + { + params: { volumeKey: "uploads" }, + query: {}, + headers: {}, + header: () => undefined, + }, + res, + ); + + expect(res.status).toHaveBeenCalledWith(401); + expect(res.json).toHaveBeenCalledWith( + expect.objectContaining({ + error: expect.stringContaining("token"), + plugin: "files", + }), + ); + } finally { + process.env.NODE_ENV = originalEnv; + } + }); + }); + + // ────────────────────────────────────────────────────────────────────── + // 2. Upload path: TransformStream size enforcement during streaming + // ────────────────────────────────────────────────────────────────────── + describe("Upload stream mid-transfer size enforcement", () => { + test("upload exceeding size mid-stream is caught by execute and returns error", async () => { + const plugin = new FilesPlugin({ + volumes: { + uploads: { maxUploadSize: 50 }, + }, + }); + const handler = getRouteHandler(plugin, "post", "/upload"); + const res = mockRes(); + + // Two chunks: 30 + 30 = 60 > maxSize of 50 + const req = mockUploadReq( + "uploads", + [Buffer.alloc(30), Buffer.alloc(30)], + { + query: { path: "/Volumes/catalog/schema/uploads/file.bin" }, + // No content-length header so the pre-check does not catch it + }, + ); + + // Spy on the connector's upload to consume the stream (the + // TransformStream size limiter fires when chunks are read). + const connector = (plugin as any).volumeConnectors.uploads; + vi.spyOn(connector, "upload").mockImplementation( + async (_client: any, _path: string, contents: any) => { + const reader = (contents as ReadableStream).getReader(); + while (true) { + const { done } = await reader.read(); + if (done) break; + } + }, + ); + + await handler(req, res); + + // The stream size error is caught by execute() and returned as + // {ok: false, status: 500}. The Content-Length pre-check (tested + // separately) catches oversized uploads before streaming starts. + const statusCalls = res.status.mock.calls.flat(); + expect(statusCalls).toContain(500); + }); + + test("outer catch returns 413 for stream size error escaping execute", async () => { + // The outer catch in _handleUpload has a specific check for the + // "exceeds maximum allowed size" message. This tests that path by + // making execute() re-throw instead of catching. 
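+      // For reference, a rough sketch of the limiter shape assumed by these
+      // tests (illustrative only, not the plugin source):
+      //   let seen = 0;
+      //   new TransformStream({
+      //     transform(chunk, controller) {
+      //       seen += chunk.byteLength;
+      //       if (seen > maxUploadSize) {
+      //         throw new Error(
+      //           `Upload stream exceeds maximum allowed size (${maxUploadSize} bytes)`,
+      //         );
+      //       }
+      //       controller.enqueue(chunk);
+      //     },
+      //   });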
+ const plugin = new FilesPlugin({ + volumes: { + uploads: { maxUploadSize: 50 }, + }, + }); + const handler = getRouteHandler(plugin, "post", "/upload"); + const res = mockRes(); + + const req = mockUploadReq("uploads", [Buffer.from("data")], { + query: { path: "/Volumes/catalog/schema/uploads/file.bin" }, + }); + + // Override trackWrite to throw the size error directly + vi.spyOn(plugin as any, "trackWrite").mockRejectedValue( + new Error("Upload stream exceeds maximum allowed size (50 bytes)"), + ); + + await handler(req, res); + + expect(res.status).toHaveBeenCalledWith(413); + expect(res.json).toHaveBeenCalledWith( + expect.objectContaining({ + error: expect.stringContaining("exceeds maximum allowed size"), + plugin: "files", + }), + ); + }); + + test("upload within size limit succeeds", async () => { + const plugin = new FilesPlugin({ + volumes: { + uploads: { maxUploadSize: 100 }, + }, + }); + const handler = getRouteHandler(plugin, "post", "/upload"); + const res = mockRes(); + + const req = mockUploadReq( + "uploads", + [Buffer.from("small file content")], + { + query: { path: "/Volumes/catalog/schema/uploads/small.txt" }, + }, + ); + + const connector = (plugin as any).volumeConnectors.uploads; + vi.spyOn(connector, "upload").mockImplementation( + async (_client: any, _path: string, contents: any) => { + const reader = (contents as ReadableStream).getReader(); + while (true) { + const { done } = await reader.read(); + if (done) break; + } + }, + ); + + await handler(req, res); + + expect(res.json).toHaveBeenCalledWith( + expect.objectContaining({ success: true }), + ); + }); + }); + + // ────────────────────────────────────────────────────────────────────── + // 3. Upload: cache invalidation after successful upload + // ────────────────────────────────────────────────────────────────────── + describe("Upload cache invalidation", () => { + test("successful upload calls cache.delete for parent directory", async () => { + const plugin = new FilesPlugin(VOLUMES_CONFIG); + const handler = getRouteHandler(plugin, "post", "/upload"); + const res = mockRes(); + + const req = mockUploadReq("uploads", [Buffer.from("file content")], { + query: { path: "/Volumes/catalog/schema/uploads/dir/file.txt" }, + }); + + const connector = (plugin as any).volumeConnectors.uploads; + vi.spyOn(connector, "upload").mockImplementation( + async (_client: any, _path: string, contents: any) => { + const reader = (contents as ReadableStream).getReader(); + while (true) { + const { done } = await reader.read(); + if (done) break; + } + }, + ); + + await handler(req, res); + + expect(res.json).toHaveBeenCalledWith( + expect.objectContaining({ success: true }), + ); + // _invalidateListCache should call generateKey and then delete + expect(mockCacheInstance.generateKey).toHaveBeenCalled(); + expect(mockCacheInstance.delete).toHaveBeenCalled(); + }); + }); + + // ────────────────────────────────────────────────────────────────────── + // 4. 
Raw endpoint: CSP sandbox header and safe vs unsafe content type + // ────────────────────────────────────────────────────────────────────── + describe("Raw endpoint security headers", () => { + function makeStreamResponse(content: string) { + const stream = new ReadableStream({ + start(controller) { + controller.enqueue(new TextEncoder().encode(content)); + controller.close(); + }, + }); + return { contents: stream }; + } + + test("raw endpoint sets CSP sandbox header", async () => { + const plugin = new FilesPlugin(VOLUMES_CONFIG); + const handler = getRouteHandler(plugin, "get", "/raw"); + const res = mockRes(); + + mockClient.files.download.mockResolvedValue(makeStreamResponse("data")); + + await handler( + mockReq("uploads", { + query: { path: "/Volumes/catalog/schema/uploads/data.json" }, + }), + res, + ); + + expect(res.setHeader).toHaveBeenCalledWith( + "Content-Security-Policy", + "sandbox", + ); + }); + + test("raw endpoint with safe content type (image/png) does not set Content-Disposition", async () => { + const plugin = new FilesPlugin(VOLUMES_CONFIG); + const handler = getRouteHandler(plugin, "get", "/raw"); + const res = mockRes(); + + mockClient.files.download.mockResolvedValue( + makeStreamResponse("PNG data"), + ); + + await handler( + mockReq("uploads", { + query: { path: "/Volumes/catalog/schema/uploads/image.png" }, + }), + res, + ); + + expect(res.setHeader).toHaveBeenCalledWith("Content-Type", "image/png"); + expect(res.setHeader).toHaveBeenCalledWith( + "Content-Security-Policy", + "sandbox", + ); + + // Content-Disposition should NOT be set for safe inline types + const dispositionCalls = res.setHeader.mock.calls.filter( + (c: string[]) => c[0] === "Content-Disposition", + ); + expect(dispositionCalls).toHaveLength(0); + }); + + test("raw endpoint with unsafe content type (text/html) forces download", async () => { + const plugin = new FilesPlugin(VOLUMES_CONFIG); + const handler = getRouteHandler(plugin, "get", "/raw"); + const res = mockRes(); + + mockClient.files.download.mockResolvedValue( + makeStreamResponse(""), + ); + + await handler( + mockReq("uploads", { + query: { path: "/Volumes/catalog/schema/uploads/page.html" }, + }), + res, + ); + + expect(res.setHeader).toHaveBeenCalledWith("Content-Type", "text/html"); + expect(res.setHeader).toHaveBeenCalledWith( + "Content-Security-Policy", + "sandbox", + ); + expect(res.setHeader).toHaveBeenCalledWith( + "Content-Disposition", + 'attachment; filename="page.html"', + ); + }); + + test("raw endpoint with SVG forces download", async () => { + const plugin = new FilesPlugin(VOLUMES_CONFIG); + const handler = getRouteHandler(plugin, "get", "/raw"); + const res = mockRes(); + + mockClient.files.download.mockResolvedValue( + makeStreamResponse(""), + ); + + await handler( + mockReq("uploads", { + query: { path: "/Volumes/catalog/schema/uploads/icon.svg" }, + }), + res, + ); + + expect(res.setHeader).toHaveBeenCalledWith( + "Content-Disposition", + 'attachment; filename="icon.svg"', + ); + }); + + test("raw endpoint sets X-Content-Type-Options: nosniff", async () => { + const plugin = new FilesPlugin(VOLUMES_CONFIG); + const handler = getRouteHandler(plugin, "get", "/raw"); + const res = mockRes(); + + mockClient.files.download.mockResolvedValue( + makeStreamResponse("content"), + ); + + await handler( + mockReq("uploads", { + query: { path: "/Volumes/catalog/schema/uploads/file.txt" }, + }), + res, + ); + + expect(res.setHeader).toHaveBeenCalledWith( + "X-Content-Type-Options", + "nosniff", + ); + }); + + test("raw 
endpoint with missing path returns 400", async () => { + const plugin = new FilesPlugin(VOLUMES_CONFIG); + const handler = getRouteHandler(plugin, "get", "/raw"); + const res = mockRes(); + + await handler(mockReq("uploads", { query: {} }), res); + + expect(res.status).toHaveBeenCalledWith(400); + expect(res.json).toHaveBeenCalledWith( + expect.objectContaining({ + error: "path is required", + plugin: "files", + }), + ); + }); + }); + + // ────────────────────────────────────────────────────────────────────── + // 5. Download endpoint: Content-Disposition with sanitized filename + // ────────────────────────────────────────────────────────────────────── + describe("Download endpoint Content-Disposition", () => { + function makeStreamResponse(content: string) { + const stream = new ReadableStream({ + start(controller) { + controller.enqueue(new TextEncoder().encode(content)); + controller.close(); + }, + }); + return { contents: stream }; + } + + test("download sets Content-Disposition: attachment with filename", async () => { + const plugin = new FilesPlugin(VOLUMES_CONFIG); + const handler = getRouteHandler(plugin, "get", "/download"); + const res = mockRes(); + + mockClient.files.download.mockResolvedValue( + makeStreamResponse("file data"), + ); + + await handler( + mockReq("uploads", { + query: { path: "/Volumes/catalog/schema/uploads/report.pdf" }, + }), + res, + ); + + expect(res.setHeader).toHaveBeenCalledWith( + "Content-Disposition", + 'attachment; filename="report.pdf"', + ); + }); + + test("download sanitizes filename with special characters", async () => { + const plugin = new FilesPlugin(VOLUMES_CONFIG); + const handler = getRouteHandler(plugin, "get", "/download"); + const res = mockRes(); + + mockClient.files.download.mockResolvedValue(makeStreamResponse("data")); + + await handler( + mockReq("uploads", { + query: { path: '/Volumes/catalog/schema/uploads/my "file".txt' }, + }), + res, + ); + + // Quotes in filenames should be escaped + expect(res.setHeader).toHaveBeenCalledWith( + "Content-Disposition", + 'attachment; filename="my \\"file\\".txt"', + ); + }); + + test("download always sets Content-Disposition even for safe types", async () => { + const plugin = new FilesPlugin(VOLUMES_CONFIG); + const handler = getRouteHandler(plugin, "get", "/download"); + const res = mockRes(); + + mockClient.files.download.mockResolvedValue(makeStreamResponse("{}")); + + await handler( + mockReq("uploads", { + query: { path: "/Volumes/catalog/schema/uploads/data.json" }, + }), + res, + ); + + // Download mode always forces attachment, even for safe types + expect(res.setHeader).toHaveBeenCalledWith( + "Content-Disposition", + 'attachment; filename="data.json"', + ); + }); + + test("download with missing path returns 400", async () => { + const plugin = new FilesPlugin(VOLUMES_CONFIG); + const handler = getRouteHandler(plugin, "get", "/download"); + const res = mockRes(); + + await handler(mockReq("uploads", { query: {} }), res); + + expect(res.status).toHaveBeenCalledWith(400); + expect(res.json).toHaveBeenCalledWith( + expect.objectContaining({ + error: "path is required", + plugin: "files", + }), + ); + }); + + test("download with response having no contents calls res.end()", async () => { + const plugin = new FilesPlugin(VOLUMES_CONFIG); + const handler = getRouteHandler(plugin, "get", "/download"); + const res = mockRes(); + + // Response with no contents field (empty file) + mockClient.files.download.mockResolvedValue({}); + + await handler( + mockReq("uploads", { + query: { path: 
"/Volumes/catalog/schema/uploads/empty.txt" }, + }), + res, + ); + + expect(res.end).toHaveBeenCalled(); + }); + }); + + // ────────────────────────────────────────────────────────────────────── + // 6. Delete endpoint: cache invalidation + // ────────────────────────────────────────────────────────────────────── + describe("Delete cache invalidation", () => { + test("successful delete invalidates list cache", async () => { + const plugin = new FilesPlugin(VOLUMES_CONFIG); + const handler = getRouteHandler(plugin, "delete", ""); + const res = mockRes(); + + mockClient.files.delete.mockResolvedValue(undefined); + + await handler( + mockReq("uploads", { + query: { path: "/Volumes/catalog/schema/uploads/dir/file.txt" }, + }), + res, + ); + + expect(res.json).toHaveBeenCalledWith( + expect.objectContaining({ success: true }), + ); + expect(mockCacheInstance.generateKey).toHaveBeenCalled(); + expect(mockCacheInstance.delete).toHaveBeenCalled(); + }); + + test("delete without path returns 400", async () => { + const plugin = new FilesPlugin(VOLUMES_CONFIG); + const handler = getRouteHandler(plugin, "delete", ""); + const res = mockRes(); + + await handler(mockReq("uploads", { query: {} }), res); + + expect(res.status).toHaveBeenCalledWith(400); + expect(res.json).toHaveBeenCalledWith( + expect.objectContaining({ error: "path is required" }), + ); + }); + + test("delete that throws ApiError returns proper status", async () => { + const plugin = new FilesPlugin(VOLUMES_CONFIG); + const handler = getRouteHandler(plugin, "delete", ""); + const res = mockRes(); + + mockClient.files.delete.mockRejectedValue( + new MockApiError("Not found", 404), + ); + + await handler( + mockReq("uploads", { + query: { path: "/Volumes/catalog/schema/uploads/missing.txt" }, + }), + res, + ); + + // SDK errors go through execute() which returns {ok: false, status: 404} + // then _sendStatusError is called with STATUS_CODES[404] = "Not Found" + expect(res.status).toHaveBeenCalledWith(404); + }); + }); + + // ────────────────────────────────────────────────────────────────────── + // 7. 
Mkdir endpoint: cache invalidation + // ────────────────────────────────────────────────────────────────────── + describe("Mkdir cache invalidation", () => { + test("successful mkdir invalidates list cache", async () => { + const plugin = new FilesPlugin(VOLUMES_CONFIG); + const handler = getRouteHandler(plugin, "post", "/mkdir"); + const res = mockRes(); + + mockClient.files.createDirectory.mockResolvedValue(undefined); + + await handler( + mockReq("uploads", { + body: { path: "/Volumes/catalog/schema/uploads/newdir" }, + }), + res, + ); + + expect(res.json).toHaveBeenCalledWith( + expect.objectContaining({ success: true }), + ); + expect(mockCacheInstance.generateKey).toHaveBeenCalled(); + expect(mockCacheInstance.delete).toHaveBeenCalled(); + }); + + test("mkdir without path returns 400", async () => { + const plugin = new FilesPlugin(VOLUMES_CONFIG); + const handler = getRouteHandler(plugin, "post", "/mkdir"); + const res = mockRes(); + + await handler(mockReq("uploads", { body: {} }), res); + + expect(res.status).toHaveBeenCalledWith(400); + expect(res.json).toHaveBeenCalledWith( + expect.objectContaining({ error: "path is required" }), + ); + }); + + test("mkdir that throws ApiError 409 is handled via execute", async () => { + const plugin = new FilesPlugin(VOLUMES_CONFIG); + const handler = getRouteHandler(plugin, "post", "/mkdir"); + const res = mockRes(); + + mockClient.files.createDirectory.mockRejectedValue( + new MockApiError("Conflict", 409), + ); + + await handler( + mockReq("uploads", { + body: { path: "/Volumes/catalog/schema/uploads/existing" }, + }), + res, + ); + + // SDK errors go through execute() -> _sendStatusError with status 409 + expect(res.status).toHaveBeenCalledWith(409); + }); + }); + + // ────────────────────────────────────────────────────────────────────── + // 8. 
Shutdown: trackWrite waits for in-flight writes, deadline timeout + // ────────────────────────────────────────────────────────────────────── + describe("Shutdown and trackWrite", () => { + beforeEach(() => { + vi.useFakeTimers(); + }); + + afterEach(() => { + vi.useRealTimers(); + }); + + test("shutdown waits for in-flight writes to complete", async () => { + const plugin = new FilesPlugin(VOLUMES_CONFIG); + + // Simulate an in-flight write + (plugin as any).inflightWrites = 1; + + const shutdownPromise = plugin.shutdown(); + + // After 500ms the shutdown loop should still be waiting + await vi.advanceTimersByTimeAsync(500); + + // Simulate the write completing + (plugin as any).inflightWrites = 0; + + await vi.advanceTimersByTimeAsync(500); + await shutdownPromise; + + // Shutdown should have completed + expect((plugin as any).inflightWrites).toBe(0); + }); + + test("shutdown times out after 10 seconds with pending writes", async () => { + const plugin = new FilesPlugin(VOLUMES_CONFIG); + const abortAllSpy = vi.spyOn((plugin as any).streamManager, "abortAll"); + + // Simulate an in-flight write that never completes + (plugin as any).inflightWrites = 2; + + const shutdownPromise = plugin.shutdown(); + + // Advance past the 10-second deadline + await vi.advanceTimersByTimeAsync(11_000); + await shutdownPromise; + + // Should still call abortAll even after timeout + expect(abortAllSpy).toHaveBeenCalled(); + // inflightWrites remains > 0 since the writes never completed + expect((plugin as any).inflightWrites).toBe(2); + }); + + test("shutdown completes immediately when no in-flight writes", async () => { + const plugin = new FilesPlugin(VOLUMES_CONFIG); + const abortAllSpy = vi.spyOn((plugin as any).streamManager, "abortAll"); + + (plugin as any).inflightWrites = 0; + + const shutdownPromise = plugin.shutdown(); + await vi.advanceTimersByTimeAsync(0); + await shutdownPromise; + + expect(abortAllSpy).toHaveBeenCalled(); + }); + + test("trackWrite increments and decrements inflightWrites correctly", async () => { + const plugin = new FilesPlugin(VOLUMES_CONFIG); + expect((plugin as any).inflightWrites).toBe(0); + + let resolveInner!: (value: string) => void; + const innerPromise = new Promise((r) => { + resolveInner = r; + }); + + const trackPromise = (plugin as any).trackWrite(() => innerPromise); + + // While the tracked fn is running, inflightWrites should be 1 + expect((plugin as any).inflightWrites).toBe(1); + + resolveInner("done"); + const result = await trackPromise; + + expect(result).toBe("done"); + expect((plugin as any).inflightWrites).toBe(0); + }); + + test("trackWrite decrements inflightWrites even on rejection", async () => { + const plugin = new FilesPlugin(VOLUMES_CONFIG); + + const trackPromise = (plugin as any).trackWrite(() => + Promise.reject(new Error("write failed")), + ); + + await expect(trackPromise).rejects.toThrow("write failed"); + expect((plugin as any).inflightWrites).toBe(0); + }); + }); + + // ────────────────────────────────────────────────────────────────────── + // 9. 
Volume discovery: merging explicit config with env vars + // ────────────────────────────────────────────────────────────────────── + describe("Volume discovery merging", () => { + test("explicit config takes priority over env vars", () => { + const volumes = FilesPlugin.discoverVolumes({ + volumes: { + uploads: { maxUploadSize: 42 }, + custom: { maxUploadSize: 99 }, + }, + }); + + // uploads: explicit config wins (maxUploadSize: 42), not {} from env + expect(volumes.uploads).toEqual({ maxUploadSize: 42 }); + // exports: discovered from env with default empty config + expect(volumes.exports).toEqual({}); + // custom: explicit only, no env var + expect(volumes.custom).toEqual({ maxUploadSize: 99 }); + }); + + test("discovered volumes get empty config objects", () => { + process.env.DATABRICKS_VOLUME_DATA = "/Volumes/catalog/schema/data"; + + try { + const volumes = FilesPlugin.discoverVolumes({}); + expect(volumes.data).toEqual({}); + } finally { + delete process.env.DATABRICKS_VOLUME_DATA; + } + }); + + test("explicit volumes without env vars still appear", () => { + delete process.env.DATABRICKS_VOLUME_UPLOADS; + delete process.env.DATABRICKS_VOLUME_EXPORTS; + + const volumes = FilesPlugin.discoverVolumes({ + volumes: { + private: { maxUploadSize: 10 }, + }, + }); + + expect(Object.keys(volumes)).toEqual(["private"]); + expect(volumes.private).toEqual({ maxUploadSize: 10 }); + }); + + test("env var volume is not added when explicit config has the same key", () => { + process.env.DATABRICKS_VOLUME_SPECIAL = "/Volumes/catalog/schema/special"; + + try { + const volumes = FilesPlugin.discoverVolumes({ + volumes: { + special: { maxUploadSize: 500 }, + }, + }); + + // Explicit wins; should not be overwritten with {} + expect(volumes.special).toEqual({ maxUploadSize: 500 }); + } finally { + delete process.env.DATABRICKS_VOLUME_SPECIAL; + } + }); + }); + + // ────────────────────────────────────────────────────────────────────── + // 10. 
Path validation edge cases + // ────────────────────────────────────────────────────────────────────── + describe("Path validation", () => { + test("path with null bytes returns 400", async () => { + const plugin = new FilesPlugin(VOLUMES_CONFIG); + const handler = getRouteHandler(plugin, "get", "/read"); + const res = mockRes(); + + await handler( + mockReq("uploads", { query: { path: "/Volumes/test/\0evil" } }), + res, + ); + + expect(res.status).toHaveBeenCalledWith(400); + expect(res.json).toHaveBeenCalledWith( + expect.objectContaining({ + error: "path must not contain null bytes", + }), + ); + }); + + test("path exceeding 4096 characters returns 400", async () => { + const plugin = new FilesPlugin(VOLUMES_CONFIG); + const handler = getRouteHandler(plugin, "get", "/read"); + const res = mockRes(); + + const longPath = "/Volumes/test/" + "a".repeat(4100); + + await handler(mockReq("uploads", { query: { path: longPath } }), res); + + expect(res.status).toHaveBeenCalledWith(400); + expect(res.json).toHaveBeenCalledWith( + expect.objectContaining({ + error: expect.stringContaining("exceeds maximum length"), + }), + ); + }); + + test("exists without path returns 400", async () => { + const plugin = new FilesPlugin(VOLUMES_CONFIG); + const handler = getRouteHandler(plugin, "get", "/exists"); + const res = mockRes(); + + await handler(mockReq("uploads", { query: {} }), res); + + expect(res.status).toHaveBeenCalledWith(400); + expect(res.json).toHaveBeenCalledWith( + expect.objectContaining({ + error: "path is required", + plugin: "files", + }), + ); + }); + + test("metadata without path returns 400", async () => { + const plugin = new FilesPlugin(VOLUMES_CONFIG); + const handler = getRouteHandler(plugin, "get", "/metadata"); + const res = mockRes(); + + await handler(mockReq("uploads", { query: {} }), res); + + expect(res.status).toHaveBeenCalledWith(400); + expect(res.json).toHaveBeenCalledWith( + expect.objectContaining({ + error: "path is required", + plugin: "files", + }), + ); + }); + + test("preview without path returns 400", async () => { + const plugin = new FilesPlugin(VOLUMES_CONFIG); + const handler = getRouteHandler(plugin, "get", "/preview"); + const res = mockRes(); + + await handler(mockReq("uploads", { query: {} }), res); + + expect(res.status).toHaveBeenCalledWith(400); + expect(res.json).toHaveBeenCalledWith( + expect.objectContaining({ + error: "path is required", + plugin: "files", + }), + ); + }); + + test("upload without path returns 400", async () => { + const plugin = new FilesPlugin(VOLUMES_CONFIG); + const handler = getRouteHandler(plugin, "post", "/upload"); + const res = mockRes(); + + const req = mockUploadReq("uploads", [Buffer.from("data")], { + query: {}, + }); + + await handler(req, res); + + expect(res.status).toHaveBeenCalledWith(400); + expect(res.json).toHaveBeenCalledWith( + expect.objectContaining({ + error: "path is required", + plugin: "files", + }), + ); + }); + + test("delete with null bytes in path returns 400", async () => { + const plugin = new FilesPlugin(VOLUMES_CONFIG); + const handler = getRouteHandler(plugin, "delete", ""); + const res = mockRes(); + + await handler( + mockReq("uploads", { query: { path: "/Volumes/test/\0evil" } }), + res, + ); + + expect(res.status).toHaveBeenCalledWith(400); + expect(res.json).toHaveBeenCalledWith( + expect.objectContaining({ + error: "path must not contain null bytes", + }), + ); + }); + }); + + // ────────────────────────────────────────────────────────────────────── + // 11. 
clientConfig returns volume keys + // ────────────────────────────────────────────────────────────────────── + describe("clientConfig", () => { + test("returns configured volume keys", () => { + const plugin = new FilesPlugin(VOLUMES_CONFIG); + const config = plugin.clientConfig(); + + expect(config).toEqual({ volumes: ["uploads", "exports"] }); + }); + + test("returns empty volumes when none configured and no env vars", () => { + delete process.env.DATABRICKS_VOLUME_UPLOADS; + delete process.env.DATABRICKS_VOLUME_EXPORTS; + + const plugin = new FilesPlugin({ volumes: {} }); + const config = plugin.clientConfig(); + + expect(config).toEqual({ volumes: [] }); + }); + }); + + // ────────────────────────────────────────────────────────────────────── + // 12. _sendStatusError uses HTTP status code text + // ────────────────────────────────────────────────────────────────────── + describe("_sendStatusError", () => { + test("sends standard HTTP status text for known codes", () => { + const plugin = new FilesPlugin(VOLUMES_CONFIG); + const res = mockRes(); + + (plugin as any)._sendStatusError(res, 404); + + expect(res.status).toHaveBeenCalledWith(404); + expect(res.json).toHaveBeenCalledWith({ + error: "Not Found", + plugin: "files", + }); + }); + + test("sends 'Unknown Error' for non-standard status codes", () => { + const plugin = new FilesPlugin(VOLUMES_CONFIG); + const res = mockRes(); + + (plugin as any)._sendStatusError(res, 999); + + expect(res.status).toHaveBeenCalledWith(999); + expect(res.json).toHaveBeenCalledWith({ + error: "Unknown Error", + plugin: "files", + }); + }); + }); +}); diff --git a/packages/appkit/src/stream/tests/stream-registry.test.ts b/packages/appkit/src/stream/tests/stream-registry.test.ts new file mode 100644 index 000000000..d3f70e95a --- /dev/null +++ b/packages/appkit/src/stream/tests/stream-registry.test.ts @@ -0,0 +1,582 @@ +import type { Context } from "@opentelemetry/api"; +import { beforeEach, describe, expect, test, vi } from "vitest"; +import { EventRingBuffer } from "../buffers"; +import { StreamRegistry } from "../stream-registry"; +import type { StreamEntry } from "../types"; +import { SSEErrorCode } from "../types"; + +/** Create a minimal mock StreamEntry for testing. */ +function createMockStreamEntry( + streamId: string, + overrides: Partial = {}, +): StreamEntry { + return { + streamId, + generator: (async function* () {})(), + eventBuffer: new EventRingBuffer(10), + clients: new Set(), + isCompleted: false, + lastAccess: Date.now(), + abortController: new AbortController(), + traceContext: {} as Context, + ...overrides, + }; +} + +/** Create a mock response object that mimics express.Response for SSE writes. 
*/ +function createMockClient(writableEnded = false) { + return { + write: vi.fn().mockReturnValue(true), + writableEnded, + } as unknown as import("express").Response; +} + +describe("StreamRegistry", () => { + let registry: StreamRegistry; + + beforeEach(() => { + registry = new StreamRegistry(3); + }); + + describe("add and get", () => { + test("should add a stream and retrieve it by id", () => { + const entry = createMockStreamEntry("stream-1"); + registry.add(entry); + + const result = registry.get("stream-1"); + + expect(result).toBe(entry); + }); + + test("should return null for a non-existent stream", () => { + const result = registry.get("non-existent"); + + expect(result).toBeNull(); + }); + + test("should add multiple streams and retrieve each", () => { + const entry1 = createMockStreamEntry("stream-1"); + const entry2 = createMockStreamEntry("stream-2"); + const entry3 = createMockStreamEntry("stream-3"); + + registry.add(entry1); + registry.add(entry2); + registry.add(entry3); + + expect(registry.get("stream-1")).toBe(entry1); + expect(registry.get("stream-2")).toBe(entry2); + expect(registry.get("stream-3")).toBe(entry3); + }); + }); + + describe("has", () => { + test("should return true for an existing stream", () => { + const entry = createMockStreamEntry("stream-1"); + registry.add(entry); + + expect(registry.has("stream-1")).toBe(true); + }); + + test("should return false for a non-existent stream", () => { + expect(registry.has("non-existent")).toBe(false); + }); + + test("should return false after a stream is removed", () => { + const entry = createMockStreamEntry("stream-1"); + registry.add(entry); + registry.remove("stream-1"); + + expect(registry.has("stream-1")).toBe(false); + }); + }); + + describe("remove", () => { + test("should remove an existing stream", () => { + const entry = createMockStreamEntry("stream-1"); + registry.add(entry); + + registry.remove("stream-1"); + + expect(registry.get("stream-1")).toBeNull(); + expect(registry.size()).toBe(0); + }); + + test("should not throw when removing a non-existent stream", () => { + expect(() => registry.remove("non-existent")).not.toThrow(); + }); + + test("should only remove the specified stream", () => { + const entry1 = createMockStreamEntry("stream-1"); + const entry2 = createMockStreamEntry("stream-2"); + registry.add(entry1); + registry.add(entry2); + + registry.remove("stream-1"); + + expect(registry.get("stream-1")).toBeNull(); + expect(registry.get("stream-2")).toBe(entry2); + expect(registry.size()).toBe(1); + }); + }); + + describe("size", () => { + test("should return 0 for an empty registry", () => { + expect(registry.size()).toBe(0); + }); + + test("should track size as streams are added", () => { + registry.add(createMockStreamEntry("stream-1")); + expect(registry.size()).toBe(1); + + registry.add(createMockStreamEntry("stream-2")); + expect(registry.size()).toBe(2); + + registry.add(createMockStreamEntry("stream-3")); + expect(registry.size()).toBe(3); + }); + + test("should decrease when streams are removed", () => { + registry.add(createMockStreamEntry("stream-1")); + registry.add(createMockStreamEntry("stream-2")); + + registry.remove("stream-1"); + + expect(registry.size()).toBe(1); + }); + + test("should not exceed capacity after eviction", () => { + registry.add(createMockStreamEntry("stream-1", { lastAccess: 100 })); + registry.add(createMockStreamEntry("stream-2", { lastAccess: 200 })); + registry.add(createMockStreamEntry("stream-3", { lastAccess: 300 })); + + // Adding a fourth stream to 
a capacity-3 registry triggers eviction + registry.add(createMockStreamEntry("stream-4", { lastAccess: 400 })); + + expect(registry.size()).toBe(3); + }); + }); + + describe("capacity enforcement and eviction", () => { + test("should evict the oldest stream when at capacity", () => { + registry.add(createMockStreamEntry("stream-1", { lastAccess: 100 })); + registry.add(createMockStreamEntry("stream-2", { lastAccess: 200 })); + registry.add(createMockStreamEntry("stream-3", { lastAccess: 300 })); + + // Adding a fourth should evict stream-1 (oldest lastAccess=100) + registry.add(createMockStreamEntry("stream-4", { lastAccess: 400 })); + + expect(registry.has("stream-1")).toBe(false); + expect(registry.has("stream-2")).toBe(true); + expect(registry.has("stream-3")).toBe(true); + expect(registry.has("stream-4")).toBe(true); + }); + + test("should evict the stream with the smallest lastAccess and abort it", () => { + // When lastAccess order matches insertion order, the eviction logic + // cleanly targets the LRU stream. The stream with the smallest + // lastAccess is found and aborted. + const ac1 = new AbortController(); + const ac2 = new AbortController(); + const ac3 = new AbortController(); + + registry.add( + createMockStreamEntry("stream-1", { + lastAccess: 100, + abortController: ac1, + }), + ); + registry.add( + createMockStreamEntry("stream-2", { + lastAccess: 300, + abortController: ac2, + }), + ); + registry.add( + createMockStreamEntry("stream-3", { + lastAccess: 200, + abortController: ac3, + }), + ); + + // Adding stream-4 triggers eviction. stream-1 has the smallest + // lastAccess (100) so it should be targeted. + registry.add(createMockStreamEntry("stream-4", { lastAccess: 400 })); + + expect(ac1.signal.aborted).toBe(true); + expect(ac2.signal.aborted).toBe(false); + expect(ac3.signal.aborted).toBe(false); + expect(registry.has("stream-1")).toBe(false); + expect(registry.has("stream-4")).toBe(true); + }); + + test("should exclude the stream being added from eviction", () => { + // This tests the excludeStreamId parameter: if a stream with the same + // ID as the one being added already exists and is the oldest, it should + // still be excluded from eviction. In practice, the new stream won't be + // in the registry yet when eviction runs, so excludeStreamId prevents + // misidentification. + registry.add(createMockStreamEntry("stream-1", { lastAccess: 100 })); + registry.add(createMockStreamEntry("stream-2", { lastAccess: 200 })); + registry.add(createMockStreamEntry("stream-3", { lastAccess: 300 })); + + // Add stream with id "stream-1" again; eviction should skip "stream-1" + // even though stream-1 has the oldest lastAccess, because it's the + // excludeStreamId. stream-2 should be evicted instead. 
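+      // Eviction trace: the registry is at capacity (3/3), so this add()
+      // evicts first. Ordered by lastAccess: stream-1 (100) < stream-2 (200)
+      // < stream-3 (300). stream-1 is skipped as the excludeStreamId, which
+      // leaves stream-2 as the oldest evictable entry.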
+ registry.add(createMockStreamEntry("stream-1", { lastAccess: 400 })); + + // stream-1 is updated (RingBuffer updates existing keys in place) + expect(registry.has("stream-1")).toBe(true); + // stream-2 should have been evicted as it was the oldest non-excluded + expect(registry.has("stream-2")).toBe(false); + expect(registry.has("stream-3")).toBe(true); + }); + + test("should abort the evicted stream's AbortController", () => { + const abortController1 = new AbortController(); + registry.add( + createMockStreamEntry("stream-1", { + lastAccess: 100, + abortController: abortController1, + }), + ); + registry.add(createMockStreamEntry("stream-2", { lastAccess: 200 })); + registry.add(createMockStreamEntry("stream-3", { lastAccess: 300 })); + + registry.add(createMockStreamEntry("stream-4", { lastAccess: 400 })); + + expect(abortController1.signal.aborted).toBe(true); + }); + + test("should abort with 'Stream evicted' reason", () => { + const abortController1 = new AbortController(); + registry.add( + createMockStreamEntry("stream-1", { + lastAccess: 100, + abortController: abortController1, + }), + ); + registry.add(createMockStreamEntry("stream-2", { lastAccess: 200 })); + registry.add(createMockStreamEntry("stream-3", { lastAccess: 300 })); + + registry.add(createMockStreamEntry("stream-4", { lastAccess: 400 })); + + expect(abortController1.signal.reason).toBe("Stream evicted"); + }); + }); + + describe("eviction SSE broadcast", () => { + test("should send STREAM_EVICTED error to all clients of evicted stream", () => { + const client1 = createMockClient(); + const client2 = createMockClient(); + + const clients = new Set([client1, client2]); + + registry.add( + createMockStreamEntry("stream-1", { + lastAccess: 100, + clients, + }), + ); + registry.add(createMockStreamEntry("stream-2", { lastAccess: 200 })); + registry.add(createMockStreamEntry("stream-3", { lastAccess: 300 })); + + // Trigger eviction of stream-1 + registry.add(createMockStreamEntry("stream-4", { lastAccess: 400 })); + + // Each client should have received the SSE error event + for (const client of [client1, client2]) { + expect(client.write).toHaveBeenCalledWith("event: error\n"); + expect(client.write).toHaveBeenCalledWith( + `data: ${JSON.stringify({ error: "Stream evicted", code: SSEErrorCode.STREAM_EVICTED })}\n\n`, + ); + } + }); + + test("should skip clients with writableEnded=true during eviction broadcast", () => { + const activeClient = createMockClient(false); + const endedClient = createMockClient(true); + + const clients = new Set([activeClient, endedClient]); + + registry.add( + createMockStreamEntry("stream-1", { + lastAccess: 100, + clients, + }), + ); + registry.add(createMockStreamEntry("stream-2", { lastAccess: 200 })); + registry.add(createMockStreamEntry("stream-3", { lastAccess: 300 })); + + registry.add(createMockStreamEntry("stream-4", { lastAccess: 400 })); + + // Active client should receive the error + expect(activeClient.write).toHaveBeenCalledWith("event: error\n"); + + // Ended client should NOT receive any writes + expect(endedClient.write).not.toHaveBeenCalled(); + }); + + test("should handle client.write throwing an error gracefully", () => { + const throwingClient = createMockClient(false); + (throwingClient.write as ReturnType).mockImplementation( + () => { + throw new Error("Connection reset"); + }, + ); + + const normalClient = createMockClient(false); + + const clients = new Set([throwingClient, normalClient]); + + registry.add( + createMockStreamEntry("stream-1", { + lastAccess: 100, 
+ clients, + }), + ); + registry.add(createMockStreamEntry("stream-2", { lastAccess: 200 })); + registry.add(createMockStreamEntry("stream-3", { lastAccess: 300 })); + + // Should not throw despite the throwing client + expect(() => { + registry.add(createMockStreamEntry("stream-4", { lastAccess: 400 })); + }).not.toThrow(); + + // The normal client should still receive the error despite the other + // client throwing. Note: both clients are in a Set, iteration order is + // insertion order. The throwing client's error is caught per-client. + // We verify the abort still happened (the overall eviction completed). + expect(registry.has("stream-1")).toBe(false); + expect(registry.has("stream-4")).toBe(true); + }); + + test("should send correct SSE error format with STREAM_EVICTED code", () => { + const client = createMockClient(); + const clients = new Set([client]); + + registry.add( + createMockStreamEntry("stream-1", { + lastAccess: 100, + clients, + }), + ); + registry.add(createMockStreamEntry("stream-2", { lastAccess: 200 })); + registry.add(createMockStreamEntry("stream-3", { lastAccess: 300 })); + + registry.add(createMockStreamEntry("stream-4", { lastAccess: 400 })); + + // Verify the exact data payload + const dataCall = ( + client.write as ReturnType + ).mock.calls.find((call: unknown[]) => + (call[0] as string).startsWith("data:"), + ); + expect(dataCall).toBeDefined(); + + const payload = JSON.parse( + (dataCall![0] as string).replace("data: ", "").trim(), + ); + expect(payload).toEqual({ + error: "Stream evicted", + code: "STREAM_EVICTED", + }); + }); + + test("should broadcast to multiple clients on the same evicted stream", () => { + const client1 = createMockClient(); + const client2 = createMockClient(); + const client3 = createMockClient(); + + const clients = new Set([client1, client2, client3]); + + registry.add( + createMockStreamEntry("stream-1", { + lastAccess: 100, + clients, + }), + ); + registry.add(createMockStreamEntry("stream-2", { lastAccess: 200 })); + registry.add(createMockStreamEntry("stream-3", { lastAccess: 300 })); + + registry.add(createMockStreamEntry("stream-4", { lastAccess: 400 })); + + // All three clients should have received exactly 2 write calls each + // (one for "event: error\n" and one for the data line) + for (const client of [client1, client2, client3]) { + expect(client.write).toHaveBeenCalledTimes(2); + } + }); + + test("should not broadcast if evicted stream has no clients", () => { + const abortController = new AbortController(); + + registry.add( + createMockStreamEntry("stream-1", { + lastAccess: 100, + clients: new Set(), + abortController, + }), + ); + registry.add(createMockStreamEntry("stream-2", { lastAccess: 200 })); + registry.add(createMockStreamEntry("stream-3", { lastAccess: 300 })); + + // Should not throw even with no clients + expect(() => { + registry.add(createMockStreamEntry("stream-4", { lastAccess: 400 })); + }).not.toThrow(); + + // Stream should still be evicted and aborted + expect(registry.has("stream-1")).toBe(false); + expect(abortController.signal.aborted).toBe(true); + }); + }); + + describe("clear", () => { + test("should abort all streams and clear the registry", () => { + const ac1 = new AbortController(); + const ac2 = new AbortController(); + const ac3 = new AbortController(); + + registry.add(createMockStreamEntry("stream-1", { abortController: ac1 })); + registry.add(createMockStreamEntry("stream-2", { abortController: ac2 })); + registry.add(createMockStreamEntry("stream-3", { abortController: ac3 
})); + + registry.clear(); + + expect(registry.size()).toBe(0); + expect(ac1.signal.aborted).toBe(true); + expect(ac2.signal.aborted).toBe(true); + expect(ac3.signal.aborted).toBe(true); + }); + + test("should abort with 'Server shutdown' reason", () => { + const ac = new AbortController(); + registry.add(createMockStreamEntry("stream-1", { abortController: ac })); + + registry.clear(); + + expect(ac.signal.reason).toBe("Server shutdown"); + }); + + test("should handle clearing an empty registry", () => { + expect(() => registry.clear()).not.toThrow(); + expect(registry.size()).toBe(0); + }); + + test("should make all streams inaccessible after clear", () => { + registry.add(createMockStreamEntry("stream-1")); + registry.add(createMockStreamEntry("stream-2")); + + registry.clear(); + + expect(registry.get("stream-1")).toBeNull(); + expect(registry.get("stream-2")).toBeNull(); + expect(registry.has("stream-1")).toBe(false); + expect(registry.has("stream-2")).toBe(false); + }); + + test("should allow adding new streams after clear", () => { + registry.add(createMockStreamEntry("stream-1")); + registry.clear(); + + const newEntry = createMockStreamEntry("stream-new"); + registry.add(newEntry); + + expect(registry.get("stream-new")).toBe(newEntry); + expect(registry.size()).toBe(1); + }); + }); + + describe("edge cases", () => { + test("should work with capacity of 1", () => { + const smallRegistry = new StreamRegistry(1); + const ac1 = new AbortController(); + + smallRegistry.add( + createMockStreamEntry("stream-1", { + lastAccess: 100, + abortController: ac1, + }), + ); + expect(smallRegistry.size()).toBe(1); + + smallRegistry.add(createMockStreamEntry("stream-2", { lastAccess: 200 })); + + expect(smallRegistry.size()).toBe(1); + expect(smallRegistry.has("stream-1")).toBe(false); + expect(smallRegistry.has("stream-2")).toBe(true); + expect(ac1.signal.aborted).toBe(true); + }); + + test("should handle adding a stream with the same id (update)", () => { + const entry1 = createMockStreamEntry("stream-1", { + lastAccess: 100, + }); + const entry2 = createMockStreamEntry("stream-1", { + lastAccess: 200, + }); + + registry.add(entry1); + registry.add(entry2); + + // The RingBuffer updates in place for same key + expect(registry.size()).toBe(1); + const retrieved = registry.get("stream-1"); + expect(retrieved?.lastAccess).toBe(200); + }); + + test("should handle sequential evictions correctly", () => { + registry.add(createMockStreamEntry("stream-1", { lastAccess: 100 })); + registry.add(createMockStreamEntry("stream-2", { lastAccess: 200 })); + registry.add(createMockStreamEntry("stream-3", { lastAccess: 300 })); + + // First eviction: stream-1 evicted + registry.add(createMockStreamEntry("stream-4", { lastAccess: 400 })); + expect(registry.has("stream-1")).toBe(false); + + // Second eviction: stream-2 evicted + registry.add(createMockStreamEntry("stream-5", { lastAccess: 500 })); + expect(registry.has("stream-2")).toBe(false); + + // stream-3, stream-4, stream-5 remain + expect(registry.has("stream-3")).toBe(true); + expect(registry.has("stream-4")).toBe(true); + expect(registry.has("stream-5")).toBe(true); + expect(registry.size()).toBe(3); + }); + + test("should not evict when under capacity", () => { + const ac1 = new AbortController(); + registry.add(createMockStreamEntry("stream-1", { abortController: ac1 })); + registry.add(createMockStreamEntry("stream-2")); + + // Only 2 streams in a capacity-3 registry, no eviction + expect(registry.size()).toBe(2); + 
expect(ac1.signal.aborted).toBe(false); + }); + + test("should handle mixed writable states during eviction", () => { + const activeClient = createMockClient(false); + const endedClient1 = createMockClient(true); + const endedClient2 = createMockClient(true); + + const clients = new Set([endedClient1, activeClient, endedClient2]); + + registry.add( + createMockStreamEntry("stream-1", { + lastAccess: 100, + clients, + }), + ); + registry.add(createMockStreamEntry("stream-2", { lastAccess: 200 })); + registry.add(createMockStreamEntry("stream-3", { lastAccess: 300 })); + + registry.add(createMockStreamEntry("stream-4", { lastAccess: 400 })); + + // Only the active client should receive writes + expect(activeClient.write).toHaveBeenCalledTimes(2); + expect(endedClient1.write).not.toHaveBeenCalled(); + expect(endedClient2.write).not.toHaveBeenCalled(); + }); + }); +}); From a003274c15972b6e5c7aa546cafb6e867c65d027 Mon Sep 17 00:00:00 2001 From: James Broadhead Date: Thu, 16 Apr 2026 13:15:39 +0000 Subject: [PATCH 09/17] refactor: use API enum names (JSON_ARRAY, ARROW_STREAM) and simplify format model Address Mario's review: collapse three formats (JSON, ARROW, ARROW_STREAM) into two that match the Databricks API enums. ARROW_STREAM supports both INLINE and EXTERNAL_LINKS dispositions with automatic fallback. Default remains JSON_ARRAY per reviewer request. Co-authored-by: Isaac --- .../src/react/charts/__tests__/types.test.ts | 2 +- packages/appkit-ui/src/react/charts/types.ts | 6 +- .../hooks/__tests__/use-chart-data.test.ts | 40 +++---- packages/appkit-ui/src/react/hooks/types.ts | 10 +- .../src/react/hooks/use-analytics-query.ts | 14 +-- .../src/react/hooks/use-chart-data.ts | 25 ++-- .../appkit/src/plugins/analytics/analytics.ts | 112 ++++++------------ .../plugins/analytics/tests/analytics.test.ts | 109 +++-------------- .../appkit/src/plugins/analytics/types.ts | 2 +- 9 files changed, 104 insertions(+), 216 deletions(-) diff --git a/packages/appkit-ui/src/react/charts/__tests__/types.test.ts b/packages/appkit-ui/src/react/charts/__tests__/types.test.ts index 13394dcf6..d6685ce01 100644 --- a/packages/appkit-ui/src/react/charts/__tests__/types.test.ts +++ b/packages/appkit-ui/src/react/charts/__tests__/types.test.ts @@ -93,7 +93,7 @@ describe("isQueryProps", () => { const props = { queryKey: "test_query", parameters: { limit: 100 }, - format: "json" as const, + format: "json_array" as const, }; expect(isQueryProps(props as any)).toBe(true); diff --git a/packages/appkit-ui/src/react/charts/types.ts b/packages/appkit-ui/src/react/charts/types.ts index fdcc55f1a..ec8a15dc2 100644 --- a/packages/appkit-ui/src/react/charts/types.ts +++ b/packages/appkit-ui/src/react/charts/types.ts @@ -5,7 +5,7 @@ import type { Table } from "apache-arrow"; // ============================================================================ /** Supported data formats for analytics queries */ -export type DataFormat = "json" | "arrow" | "arrow_stream" | "auto"; +export type DataFormat = "json_array" | "arrow_stream" | "auto"; /** Chart orientation */ export type Orientation = "vertical" | "horizontal"; @@ -77,8 +77,8 @@ export interface QueryProps extends ChartBaseProps { parameters?: Record; /** * Data format to use - * - "json": Use JSON format (smaller payloads, simpler) - * - "arrow": Use Arrow format (faster for large datasets) + * - "json_array": Use JSON format (smaller payloads, simpler) + * - "arrow_stream": Use Arrow format (faster for large datasets) * - "auto": Automatically select based on expected data 
size * @default "auto" */ diff --git a/packages/appkit-ui/src/react/hooks/__tests__/use-chart-data.test.ts b/packages/appkit-ui/src/react/hooks/__tests__/use-chart-data.test.ts index 32ce52cb2..686aff317 100644 --- a/packages/appkit-ui/src/react/hooks/__tests__/use-chart-data.test.ts +++ b/packages/appkit-ui/src/react/hooks/__tests__/use-chart-data.test.ts @@ -72,7 +72,7 @@ describe("useChartData", () => { }); describe("format selection", () => { - test("uses JSON format when explicitly specified", () => { + test("uses JSON_ARRAY format when explicitly specified", () => { mockUseAnalyticsQuery.mockReturnValue({ data: [], loading: false, @@ -82,18 +82,18 @@ describe("useChartData", () => { renderHook(() => useChartData({ queryKey: "test", - format: "json", + format: "json_array", }), ); expect(mockUseAnalyticsQuery).toHaveBeenCalledWith( "test", undefined, - expect.objectContaining({ format: "JSON" }), + expect.objectContaining({ format: "JSON_ARRAY" }), ); }); - test("uses ARROW format when explicitly specified", () => { + test("uses ARROW_STREAM format when explicitly specified", () => { mockUseAnalyticsQuery.mockReturnValue({ data: [], loading: false, @@ -103,18 +103,18 @@ describe("useChartData", () => { renderHook(() => useChartData({ queryKey: "test", - format: "arrow", + format: "arrow_stream", }), ); expect(mockUseAnalyticsQuery).toHaveBeenCalledWith( "test", undefined, - expect.objectContaining({ format: "ARROW" }), + expect.objectContaining({ format: "ARROW_STREAM" }), ); }); - test("auto-selects ARROW for large limit", () => { + test("auto-selects ARROW_STREAM for large limit", () => { mockUseAnalyticsQuery.mockReturnValue({ data: [], loading: false, @@ -132,11 +132,11 @@ describe("useChartData", () => { expect(mockUseAnalyticsQuery).toHaveBeenCalledWith( "test", { limit: 1000 }, - expect.objectContaining({ format: "ARROW" }), + expect.objectContaining({ format: "ARROW_STREAM" }), ); }); - test("auto-selects ARROW for date range queries", () => { + test("auto-selects ARROW_STREAM for date range queries", () => { mockUseAnalyticsQuery.mockReturnValue({ data: [], loading: false, @@ -157,7 +157,7 @@ describe("useChartData", () => { expect(mockUseAnalyticsQuery).toHaveBeenCalledWith( "test", expect.objectContaining({ startDate: "2025-01-01" }), - expect.objectContaining({ format: "ARROW" }), + expect.objectContaining({ format: "ARROW_STREAM" }), ); }); @@ -179,7 +179,7 @@ describe("useChartData", () => { expect(mockUseAnalyticsQuery).toHaveBeenCalledWith( "test", expect.anything(), - expect.objectContaining({ format: "JSON" }), + expect.objectContaining({ format: "JSON_ARRAY" }), ); }); @@ -201,11 +201,11 @@ describe("useChartData", () => { expect(mockUseAnalyticsQuery).toHaveBeenCalledWith( "test", expect.anything(), - expect.objectContaining({ format: "ARROW" }), + expect.objectContaining({ format: "ARROW_STREAM" }), ); }); - test("auto-selects ARROW_STREAM by default when no heuristics match", () => { + test("auto-selects JSON_ARRAY by default when no heuristics match", () => { mockUseAnalyticsQuery.mockReturnValue({ data: [], loading: false, @@ -223,11 +223,11 @@ describe("useChartData", () => { expect(mockUseAnalyticsQuery).toHaveBeenCalledWith( "test", { limit: 100 }, - expect.objectContaining({ format: "ARROW_STREAM" }), + expect.objectContaining({ format: "JSON_ARRAY" }), ); }); - test("defaults to auto format (ARROW_STREAM) when format is not specified", () => { + test("defaults to auto format (JSON_ARRAY) when format is not specified", () => { 
mockUseAnalyticsQuery.mockReturnValue({ data: [], loading: false, @@ -243,7 +243,7 @@ describe("useChartData", () => { expect(mockUseAnalyticsQuery).toHaveBeenCalledWith( "test", undefined, - expect.objectContaining({ format: "ARROW_STREAM" }), + expect.objectContaining({ format: "JSON_ARRAY" }), ); }); }); @@ -353,7 +353,7 @@ describe("useChartData", () => { expect(result.current.isArrow).toBe(false); }); - test("isArrow reflects requested ARROW format when data is null", () => { + test("isArrow reflects requested ARROW_STREAM format when data is null", () => { mockUseAnalyticsQuery.mockReturnValue({ data: null, loading: true, @@ -361,13 +361,13 @@ describe("useChartData", () => { }); const { result } = renderHook(() => - useChartData({ queryKey: "test", format: "arrow" }), + useChartData({ queryKey: "test", format: "arrow_stream" }), ); expect(result.current.isArrow).toBe(true); }); - test("isArrow reflects requested JSON format when data is null", () => { + test("isArrow reflects requested JSON_ARRAY format when data is null", () => { mockUseAnalyticsQuery.mockReturnValue({ data: null, loading: true, @@ -375,7 +375,7 @@ describe("useChartData", () => { }); const { result } = renderHook(() => - useChartData({ queryKey: "test", format: "json" }), + useChartData({ queryKey: "test", format: "json_array" }), ); expect(result.current.isArrow).toBe(false); diff --git a/packages/appkit-ui/src/react/hooks/types.ts b/packages/appkit-ui/src/react/hooks/types.ts index 26406f140..6ee08fe54 100644 --- a/packages/appkit-ui/src/react/hooks/types.ts +++ b/packages/appkit-ui/src/react/hooks/types.ts @@ -5,7 +5,7 @@ import type { Table } from "apache-arrow"; // ============================================================================ /** Supported response formats for analytics queries */ -export type AnalyticsFormat = "JSON" | "ARROW" | "ARROW_STREAM"; +export type AnalyticsFormat = "JSON_ARRAY" | "ARROW_STREAM"; /** * Typed Arrow Table - preserves row type information for type inference. @@ -33,9 +33,9 @@ export interface TypedArrowTable< /** Options for configuring an analytics SSE query */ export interface UseAnalyticsQueryOptions< - F extends AnalyticsFormat = "ARROW_STREAM", + F extends AnalyticsFormat = "JSON_ARRAY", > { - /** Response format - "ARROW_STREAM" (default) uses inline Arrow, "JSON" returns typed arrays, "ARROW" uses external links */ + /** Response format - "JSON_ARRAY" (default) returns typed arrays, "ARROW_STREAM" uses Arrow (inline or external links) */ format?: F; /** Maximum size of serialized parameters in bytes */ @@ -122,7 +122,9 @@ export type InferResultByFormat< T, K, F extends AnalyticsFormat, -> = F extends "ARROW" ? TypedArrowTable> : InferResult; +> = F extends "ARROW_STREAM" + ? TypedArrowTable> + : InferResult; /** * Infers parameters type from QueryRegistry[K]["parameters"] diff --git a/packages/appkit-ui/src/react/hooks/use-analytics-query.ts b/packages/appkit-ui/src/react/hooks/use-analytics-query.ts index 7d13648f4..314bd6e4c 100644 --- a/packages/appkit-ui/src/react/hooks/use-analytics-query.ts +++ b/packages/appkit-ui/src/react/hooks/use-analytics-query.ts @@ -27,8 +27,8 @@ function getArrowStreamUrl(id: string) { * Integration hook between client and analytics plugin. 
* * The return type is automatically inferred based on the format: - * - `format: "JSON"` (default): Returns typed array from QueryRegistry - * - `format: "ARROW"`: Returns TypedArrowTable with row type preserved + * - `format: "JSON_ARRAY"` (default): Returns typed array from QueryRegistry + * - `format: "ARROW_STREAM"`: Returns TypedArrowTable with row type preserved * * Note: User context execution is determined by query file naming: * - `queryKey.obo.sql`: Executes as user (OBO = on-behalf-of / user delegation) @@ -39,28 +39,28 @@ function getArrowStreamUrl(id: string) { * @param options - Analytics query settings including format * @returns Query result state with format-appropriate data type * - * @example JSON format (default) + * @example JSON_ARRAY format (default) * ```typescript * const { data } = useAnalyticsQuery("spend_data", params); * // data: Array<{ group_key: string; cost_usd: number; ... }> | null * ``` * - * @example Arrow format + * @example ARROW_STREAM format * ```typescript - * const { data } = useAnalyticsQuery("spend_data", params, { format: "ARROW" }); + * const { data } = useAnalyticsQuery("spend_data", params, { format: "ARROW_STREAM" }); * // data: TypedArrowTable<{ group_key: string; cost_usd: number; ... }> | null * ``` */ export function useAnalyticsQuery< T = unknown, K extends QueryKey = QueryKey, - F extends AnalyticsFormat = "ARROW_STREAM", + F extends AnalyticsFormat = "JSON_ARRAY", >( queryKey: K, parameters?: InferParams | null, options: UseAnalyticsQueryOptions = {} as UseAnalyticsQueryOptions, ): UseAnalyticsQueryResult> { - const format = options?.format ?? "ARROW_STREAM"; + const format = options?.format ?? "JSON_ARRAY"; const maxParametersSize = options?.maxParametersSize ?? 100 * 1024; const autoStart = options?.autoStart ?? 
true; diff --git a/packages/appkit-ui/src/react/hooks/use-chart-data.ts b/packages/appkit-ui/src/react/hooks/use-chart-data.ts index 1d1da2dda..ec4b2d4ee 100644 --- a/packages/appkit-ui/src/react/hooks/use-chart-data.ts +++ b/packages/appkit-ui/src/react/hooks/use-chart-data.ts @@ -17,8 +17,8 @@ export interface UseChartDataOptions { parameters?: Record; /** * Data format preference - * - "json": Force JSON format - * - "arrow": Force Arrow format + * - "json_array": Force JSON format + * - "arrow_stream": Force Arrow format * - "auto": Auto-select based on heuristics * @default "auto" */ @@ -50,33 +50,32 @@ export interface UseChartDataResult { function resolveFormat( format: DataFormat, parameters?: Record, -): "JSON" | "ARROW" | "ARROW_STREAM" { +): "JSON_ARRAY" | "ARROW_STREAM" { // Explicit format selection - if (format === "json") return "JSON"; - if (format === "arrow") return "ARROW"; + if (format === "json_array") return "JSON_ARRAY"; if (format === "arrow_stream") return "ARROW_STREAM"; // Auto-selection heuristics if (format === "auto") { // Check for explicit hint in parameters - if (parameters?._preferArrow === true) return "ARROW"; - if (parameters?._preferJson === true) return "JSON"; + if (parameters?._preferArrow === true) return "ARROW_STREAM"; + if (parameters?._preferJson === true) return "JSON_ARRAY"; // Check limit parameter as data size hint const limit = parameters?.limit; if (typeof limit === "number" && limit > ARROW_THRESHOLD) { - return "ARROW"; + return "ARROW_STREAM"; } // Check for date range queries (often large) if (parameters?.startDate && parameters?.endDate) { - return "ARROW"; + return "ARROW_STREAM"; } - return "ARROW_STREAM"; + return "JSON_ARRAY"; } - return "ARROW_STREAM"; + return "JSON_ARRAY"; } // ============================================================================ @@ -98,7 +97,7 @@ function resolveFormat( * // Force Arrow format * const { data } = useChartData({ * queryKey: "big_query", - * format: "arrow" + * format: "arrow_stream" * }); * ``` */ @@ -111,7 +110,7 @@ export function useChartData(options: UseChartDataOptions): UseChartDataResult { [format, parameters], ); - const isArrowFormat = resolvedFormat === "ARROW"; + const isArrowFormat = resolvedFormat === "ARROW_STREAM"; // Fetch data using the analytics query hook const { diff --git a/packages/appkit/src/plugins/analytics/analytics.ts b/packages/appkit/src/plugins/analytics/analytics.ts index d73c5bbe6..3738ea787 100644 --- a/packages/appkit/src/plugins/analytics/analytics.ts +++ b/packages/appkit/src/plugins/analytics/analytics.ts @@ -116,7 +116,7 @@ export class AnalyticsPlugin extends Plugin { res: express.Response, ): Promise { const { query_key } = req.params; - const { parameters, format = "ARROW_STREAM" } = + const { parameters, format = "JSON_ARRAY" } = req.body as IAnalyticsQueryRequest; // Request-scoped logging with WideEvent tracking @@ -194,33 +194,12 @@ export class AnalyticsPlugin extends Plugin { ); } - /** Format configurations in fallback order. */ - private static readonly FORMAT_CONFIGS = { - ARROW_STREAM: { - formatParameters: { disposition: "INLINE", format: "ARROW_STREAM" }, - type: "result" as const, - }, - JSON: { - formatParameters: { disposition: "INLINE", format: "JSON_ARRAY" }, - type: "result" as const, - }, - ARROW: { - formatParameters: { - disposition: "EXTERNAL_LINKS", - format: "ARROW_STREAM", - }, - type: "arrow" as const, - }, - }; - /** - * Execute a query with automatic format fallback. 
- * - * For the default ARROW_STREAM format, tries formats in order until one - * succeeds: ARROW_STREAM → JSON → ARROW. This handles warehouses that - * only support a subset of format/disposition combinations. + * Execute a query with automatic disposition fallback for ARROW_STREAM. * - * Explicit format requests (JSON, ARROW) are not retried. + * - JSON_ARRAY: always uses INLINE disposition, no fallback. + * - ARROW_STREAM: tries INLINE first, falls back to EXTERNAL_LINKS. + * This handles warehouses that only support one disposition. */ private async _executeWithFormatFallback( executor: AnalyticsPlugin, @@ -231,63 +210,50 @@ export class AnalyticsPlugin extends Plugin { requestedFormat: AnalyticsFormat, signal?: AbortSignal, ): Promise<{ type: string; [key: string]: any }> { - // Explicit format — no fallback. - if (requestedFormat === "JSON" || requestedFormat === "ARROW") { - const config = AnalyticsPlugin.FORMAT_CONFIGS[requestedFormat]; + if (requestedFormat === "JSON_ARRAY") { const result = await executor.query( query, processedParams, - config.formatParameters, + { disposition: "INLINE", format: "JSON_ARRAY" }, signal, ); - return { type: config.type, ...result }; + return { type: "result", ...result }; } - // Default (ARROW_STREAM) — try each format in order. - const fallbackOrder: AnalyticsFormat[] = ["ARROW_STREAM", "JSON", "ARROW"]; - - for (let i = 0; i < fallbackOrder.length; i++) { - const fmt = fallbackOrder[i]; - const config = AnalyticsPlugin.FORMAT_CONFIGS[fmt]; - try { - const result = await executor.query( - query, - processedParams, - config.formatParameters, - signal, - ); - if (i > 0) { - logger.info( - "Query succeeded with fallback format %s (preferred %s was rejected)", - fmt, - fallbackOrder[0], - ); - } - return { type: config.type, ...result }; - } catch (err: unknown) { - const msg = err instanceof Error ? err.message : String(err); - const isFormatError = - msg.includes("ARROW_STREAM") || - msg.includes("JSON_ARRAY") || - msg.includes("EXTERNAL_LINKS") || - msg.includes("INVALID_PARAMETER_VALUE") || - msg.includes("NOT_IMPLEMENTED"); - - if (!isFormatError || i === fallbackOrder.length - 1) { - throw err; - } - - logger.warn( - "Format %s rejected by warehouse, falling back to %s: %s", - fmt, - fallbackOrder[i + 1], - msg, - ); + // ARROW_STREAM: try INLINE first, fall back to EXTERNAL_LINKS. + try { + const result = await executor.query( + query, + processedParams, + { disposition: "INLINE", format: "ARROW_STREAM" }, + signal, + ); + return { type: "result", ...result }; + } catch (err: unknown) { + const msg = err instanceof Error ? err.message : String(err); + const isFormatError = + msg.includes("ARROW_STREAM") || + msg.includes("INLINE") || + msg.includes("INVALID_PARAMETER_VALUE") || + msg.includes("NOT_IMPLEMENTED"); + + if (!isFormatError) { + throw err; } + + logger.warn( + "ARROW_STREAM INLINE rejected by warehouse, falling back to EXTERNAL_LINKS: %s", + msg, + ); } - // Unreachable — last format in fallbackOrder throws on failure. 
- throw new Error("All format fallbacks exhausted"); + const result = await executor.query( + query, + processedParams, + { disposition: "EXTERNAL_LINKS", format: "ARROW_STREAM" }, + signal, + ); + return { type: "arrow", ...result }; } /** diff --git a/packages/appkit/src/plugins/analytics/tests/analytics.test.ts b/packages/appkit/src/plugins/analytics/tests/analytics.test.ts index f39b07887..5a477763f 100644 --- a/packages/appkit/src/plugins/analytics/tests/analytics.test.ts +++ b/packages/appkit/src/plugins/analytics/tests/analytics.test.ts @@ -621,44 +621,7 @@ describe("Analytics Plugin", () => { ); }); - test("/query/:query_key should pass EXTERNAL_LINKS + ARROW_STREAM format parameters when format is ARROW", async () => { - const plugin = new AnalyticsPlugin(config); - const { router, getHandler } = createMockRouter(); - - (plugin as any).app.getAppQuery = vi.fn().mockResolvedValue({ - query: "SELECT * FROM test", - isAsUser: false, - }); - - const executeMock = vi.fn().mockResolvedValue({ - result: { data: [{ id: 1 }] }, - }); - (plugin as any).SQLClient.executeStatement = executeMock; - - plugin.injectRoutes(router); - - const handler = getHandler("POST", "/query/:query_key"); - const mockReq = createMockRequest({ - params: { query_key: "test_query" }, - body: { parameters: {}, format: "ARROW" }, - }); - const mockRes = createMockResponse(); - - await handler(mockReq, mockRes); - - expect(executeMock).toHaveBeenCalledWith( - expect.anything(), - expect.objectContaining({ - statement: "SELECT * FROM test", - warehouse_id: "test-warehouse-id", - disposition: "EXTERNAL_LINKS", - format: "ARROW_STREAM", - }), - expect.any(AbortSignal), - ); - }); - - test("/query/:query_key should use INLINE + ARROW_STREAM by default when no format specified", async () => { + test("/query/:query_key should use INLINE + JSON_ARRAY by default when no format specified", async () => { const plugin = new AnalyticsPlugin(config); const { router, getHandler } = createMockRouter(); @@ -687,13 +650,13 @@ describe("Analytics Plugin", () => { expect.anything(), expect.objectContaining({ disposition: "INLINE", - format: "ARROW_STREAM", + format: "JSON_ARRAY", }), expect.any(AbortSignal), ); }); - test("/query/:query_key should not pass format parameters when format is explicitly JSON", async () => { + test("/query/:query_key should pass INLINE + JSON_ARRAY when format is explicitly JSON_ARRAY", async () => { const plugin = new AnalyticsPlugin(config); const { router, getHandler } = createMockRouter(); @@ -712,7 +675,7 @@ describe("Analytics Plugin", () => { const handler = getHandler("POST", "/query/:query_key"); const mockReq = createMockRequest({ params: { query_key: "test_query" }, - body: { parameters: {}, format: "JSON" }, + body: { parameters: {}, format: "JSON_ARRAY" }, }); const mockRes = createMockResponse(); @@ -724,7 +687,7 @@ describe("Analytics Plugin", () => { }); }); - test("/query/:query_key should fall back from ARROW_STREAM to JSON when warehouse rejects ARROW_STREAM", async () => { + test("/query/:query_key should fall back ARROW_STREAM from INLINE to EXTERNAL_LINKS when warehouse rejects INLINE", async () => { const plugin = new AnalyticsPlugin(config); const { router, getHandler } = createMockRouter(); @@ -737,11 +700,11 @@ describe("Analytics Plugin", () => { .fn() .mockRejectedValueOnce( new Error( - "INVALID_PARAMETER_VALUE: Inline disposition only supports JSON_ARRAY format", + "INVALID_PARAMETER_VALUE: ARROW_STREAM not supported with INLINE disposition", ), ) .mockResolvedValueOnce({ - 
result: { data: [{ id: 1 }] }, + result: { statement_id: "stmt-1", status: { state: "SUCCEEDED" } }, }); (plugin as any).SQLClient.executeStatement = executeMock; @@ -750,60 +713,19 @@ describe("Analytics Plugin", () => { const handler = getHandler("POST", "/query/:query_key"); const mockReq = createMockRequest({ params: { query_key: "test_query" }, - body: { parameters: {} }, + body: { parameters: {}, format: "ARROW_STREAM" }, }); const mockRes = createMockResponse(); await handler(mockReq, mockRes); - // First call: ARROW_STREAM (rejected) + // First call: INLINE (rejected) expect(executeMock.mock.calls[0][1]).toMatchObject({ disposition: "INLINE", format: "ARROW_STREAM", }); - // Second call: JSON (explicit JSON_ARRAY + INLINE) + // Second call: EXTERNAL_LINKS (fallback) expect(executeMock.mock.calls[1][1]).toMatchObject({ - disposition: "INLINE", - format: "JSON_ARRAY", - }); - }); - - test("/query/:query_key should fall back through all formats when each is rejected", async () => { - const plugin = new AnalyticsPlugin(config); - const { router, getHandler } = createMockRouter(); - - (plugin as any).app.getAppQuery = vi.fn().mockResolvedValue({ - query: "SELECT * FROM test", - isAsUser: false, - }); - - const executeMock = vi - .fn() - .mockRejectedValueOnce( - new Error("INVALID_PARAMETER_VALUE: only supports JSON_ARRAY"), - ) - .mockRejectedValueOnce( - new Error("INVALID_PARAMETER_VALUE: only supports ARROW_STREAM"), - ) - .mockResolvedValueOnce({ - result: { data: [{ id: 1 }] }, - }); - (plugin as any).SQLClient.executeStatement = executeMock; - - plugin.injectRoutes(router); - - const handler = getHandler("POST", "/query/:query_key"); - const mockReq = createMockRequest({ - params: { query_key: "test_query" }, - body: { parameters: {} }, - }); - const mockRes = createMockResponse(); - - await handler(mockReq, mockRes); - - expect(executeMock).toHaveBeenCalledTimes(3); - // Third call: ARROW (EXTERNAL_LINKS) - expect(executeMock.mock.calls[2][1]).toMatchObject({ disposition: "EXTERNAL_LINKS", format: "ARROW_STREAM", }); @@ -828,14 +750,13 @@ describe("Analytics Plugin", () => { const handler = getHandler("POST", "/query/:query_key"); const mockReq = createMockRequest({ params: { query_key: "test_query" }, - body: { parameters: {} }, + body: { parameters: {}, format: "ARROW_STREAM" }, }); const mockRes = createMockResponse(); await handler(mockReq, mockRes); - // All calls use same format (ARROW_STREAM) — no format fallback occurred. - // (executeStream's retry interceptor may retry, but always with the same format.) + // Only one call — non-format error is not retried with different disposition. for (const call of executeMock.mock.calls) { expect(call[1]).toMatchObject({ disposition: "INLINE", @@ -844,7 +765,7 @@ describe("Analytics Plugin", () => { } }); - test("/query/:query_key should not fall back when format is explicitly JSON", async () => { + test("/query/:query_key should not fall back when format is explicitly JSON_ARRAY", async () => { const plugin = new AnalyticsPlugin(config); const { router, getHandler } = createMockRouter(); @@ -865,13 +786,13 @@ describe("Analytics Plugin", () => { const handler = getHandler("POST", "/query/:query_key"); const mockReq = createMockRequest({ params: { query_key: "test_query" }, - body: { parameters: {}, format: "JSON" }, + body: { parameters: {}, format: "JSON_ARRAY" }, }); const mockRes = createMockResponse(); await handler(mockReq, mockRes); - // All calls use JSON_ARRAY + INLINE — explicit JSON, no fallback. 
+ // All calls use JSON_ARRAY + INLINE — explicit JSON_ARRAY, no fallback. for (const call of executeMock.mock.calls) { expect(call[1]).toMatchObject({ disposition: "INLINE", diff --git a/packages/appkit/src/plugins/analytics/types.ts b/packages/appkit/src/plugins/analytics/types.ts index bc7568f9c..c0e72fdba 100644 --- a/packages/appkit/src/plugins/analytics/types.ts +++ b/packages/appkit/src/plugins/analytics/types.ts @@ -4,7 +4,7 @@ export interface IAnalyticsConfig extends BasePluginConfig { timeout?: number; } -export type AnalyticsFormat = "JSON" | "ARROW" | "ARROW_STREAM"; +export type AnalyticsFormat = "JSON_ARRAY" | "ARROW_STREAM"; export interface IAnalyticsQueryRequest { parameters?: Record; format?: AnalyticsFormat; From 2351f38dc87ce45fd193e3d409dbf896de9e1c8a Mon Sep 17 00:00:00 2001 From: James Broadhead Date: Mon, 27 Apr 2026 19:58:28 +0000 Subject: [PATCH 10/17] fix: address ACE multi-model review findings MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fixes issues found by parallel review across Claude harsh-reviewer, GPT 5.4 xhigh, and Gemini 3.1 Pro on PR #256. - Validate `format` at the route boundary (reject anything other than JSON_ARRAY / ARROW_STREAM with 400) so legacy `"JSON"` / `"ARROW"` callers fail loudly instead of silently falling through to ARROW_STREAM. - Disable cache for ARROW_STREAM queries — EXTERNAL_LINKS pre-signed URLs expire much sooner than the default 1h TTL, so caching the statement_id hands out dead URLs on cache hits. - Re-check `signal.aborted` before issuing the EXTERNAL_LINKS fallback — previously an aborted query would still bill a second statement that the client never reads. - Replace fragile `msg.includes("ARROW_STREAM") || msg.includes("INLINE")` format-error detection with `_isInlineArrowUnsupported()` that requires both keywords plus a marker phrase, or a structured `error_code` of INVALID_PARAMETER_VALUE / NOT_IMPLEMENTED. Reduces false-positives on legitimate SQL/permission errors that mention those words. - Normalize BigInt values from `row.toJSON()` when decoding inline Arrow attachments — `apache-arrow` returns BigInt for INT64/DECIMAL columns, which JSON.stringify rejects when the SSE writer serializes the response. - Cap inline Arrow attachment decode at 64 MiB and wrap decode in try/catch so malformed payloads surface a typed ExecutionError instead of a generic 500. - Detect external_links explicitly in `_transformDataArray` rather than treating any "no attachment + no data_array" response as EXTERNAL_LINKS. Previously an empty INLINE response would return a phantom statement_id the client could not fetch. - Remove the typegen DESCRIBE QUERY fallback that retried without `format` and got ARROW_STREAM responses with no `data_array`, silently degrading generated types to `unknown`. The surrounding allSettled already converts thrown errors to `unknown` types via `generateUnknownResultQuery`. 
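For reference, a minimal standalone reproduction of the BigInt failure mode
that normalizeArrowRow() addresses (illustrative only; the column name and
value below are made up, not taken from this change):

    import { tableFromArrays } from "apache-arrow";

    // apache-arrow materializes INT64 columns as BigInt via row.toJSON():
    const table = tableFromArrays({
      total: new BigInt64Array([9007199254740993n]), // 2^53 + 1
    });
    const row = table.get(0)!.toJSON(); // { total: 9007199254740993n }

    JSON.stringify(row);
    // => TypeError: Do not know how to serialize a BigInt

    // After normalization the value survives as the string
    // "9007199254740993" (it is outside Number's safe-integer range),
    // so the SSE writer can serialize the response losslessly.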
Co-authored-by: Isaac
Signed-off-by: James Broadhead
---
 .../src/connectors/sql-warehouse/client.ts     | 69 +++++++++++++---
 .../appkit/src/plugins/analytics/analytics.ts  | 80 ++++++++++++-----
 .../src/type-generator/query-registry.ts       | 36 +++------
 3 files changed, 131 insertions(+), 54 deletions(-)

diff --git a/packages/appkit/src/connectors/sql-warehouse/client.ts b/packages/appkit/src/connectors/sql-warehouse/client.ts
index f844693f6..8a25a9f12 100644
--- a/packages/appkit/src/connectors/sql-warehouse/client.ts
+++ b/packages/appkit/src/connectors/sql-warehouse/client.ts
@@ -26,6 +26,32 @@ import { executeStatementDefaults } from "./defaults";
 
 const logger = createLogger("connectors:sql-warehouse");
 
+/** Maximum decoded size for inline Arrow IPC attachments (64 MiB). */
+const MAX_INLINE_ATTACHMENT_BYTES = 64 * 1024 * 1024;
+
+/**
+ * Convert Arrow row values to JSON-serializable shapes.
+ * `apache-arrow` returns `BigInt` for INT64/DECIMAL columns, which `JSON.stringify`
+ * cannot serialize. Convert BigInts to a Number when in safe-integer range,
+ * otherwise to a string to preserve precision. `Date` objects serialize fine
+ * (ISO string) and are left alone.
+ */
+function normalizeArrowRow(
+  row: Record<string, unknown>,
+): Record<string, unknown> {
+  for (const key in row) {
+    const v = row[key];
+    if (typeof v === "bigint") {
+      row[key] =
+        v <= BigInt(Number.MAX_SAFE_INTEGER) &&
+        v >= BigInt(Number.MIN_SAFE_INTEGER)
+          ? Number(v)
+          : v.toString();
+    }
+  }
+  return row;
+}
+
 interface SQLWarehouseConfig {
   timeout?: number;
   telemetry?: TelemetryOptions;
@@ -394,20 +420,23 @@
   private _transformDataArray(response: sql.StatementResponse) {
     if (response.manifest?.format === "ARROW_STREAM") {
-      const result = response.result as any;
+      const result = response.result as
+        | (sql.ResultData & { attachment?: string })
+        | undefined;
 
       // Inline Arrow: some warehouses return base64 Arrow IPC in `attachment`.
       if (result?.attachment) {
         return this._transformArrowAttachment(response, result.attachment);
       }
 
-      // Inline data_array: fall through to the row transform below.
-      if (result?.data_array) {
-        // Fall through.
-      } else {
-        // External links: data fetched separately via statement_id.
+      // External links: data fetched separately via statement_id.
+      if (result?.external_links) {
         return this.updateWithArrowStatus(response);
       }
+
+      // Inline data_array: fall through to the row transform below.
+      // (Anything else — empty result with no attachment, data_array, or
+      // external_links — also falls through and produces { data: [] }.)
     }
 
     if (!response.result?.data_array || !response.manifest?.schema?.columns) {
@@ -462,10 +491,30 @@
     response: sql.StatementResponse,
     attachment: string,
   ) {
-    const buf = Buffer.from(attachment, "base64");
-    const table = tableFromIPC(buf);
-    const data = table.toArray().map((row) => row.toJSON());
-    const { attachment: _att, ...restResult } = response.result as any;
+    // Cap the decoded size to protect against unbounded inline payloads from
+    // misbehaving warehouses. 64 MiB is well above the typical inline limit
+    // (~16 MiB) but bounds memory if a server returns a runaway response.
+    const decodedSize = Math.ceil((attachment.length * 3) / 4);
+    if (decodedSize > MAX_INLINE_ATTACHMENT_BYTES) {
+      throw ExecutionError.statementFailed(
+        `Inline Arrow attachment exceeds maximum size (${decodedSize} > ${MAX_INLINE_ATTACHMENT_BYTES} bytes)`,
+      );
+    }
+
+    let data: Record<string, unknown>[];
+    try {
+      const buf = Buffer.from(attachment, "base64");
+      const table = tableFromIPC(buf);
+      data = table.toArray().map((row) => normalizeArrowRow(row.toJSON()));
+    } catch (err: unknown) {
+      const msg = err instanceof Error ? err.message : String(err);
+      throw ExecutionError.statementFailed(
+        `Failed to decode inline Arrow attachment: ${msg}`,
+      );
+    }
+    const { attachment: _att, ...restResult } = response.result as {
+      attachment?: string;
+    } & sql.ResultData;
     return {
       ...response,
       result: {
diff --git a/packages/appkit/src/plugins/analytics/analytics.ts b/packages/appkit/src/plugins/analytics/analytics.ts
index 3738ea787..e7b5ae15d 100644
--- a/packages/appkit/src/plugins/analytics/analytics.ts
+++ b/packages/appkit/src/plugins/analytics/analytics.ts
@@ -119,6 +119,13 @@ export class AnalyticsPlugin extends Plugin {
     const { parameters, format = "JSON_ARRAY" } =
       req.body as IAnalyticsQueryRequest;
 
+    if (format !== "JSON_ARRAY" && format !== "ARROW_STREAM") {
+      res.status(400).json({
+        error: `Invalid format: ${String(format)}. Expected "JSON_ARRAY" or "ARROW_STREAM".`,
+      });
+      return;
+    }
+
     // Request-scoped logging with WideEvent tracking
     logger.debug(req, "Executing query: %s (format=%s)", query_key, format);
 
@@ -154,19 +161,27 @@
 
     const hashedQuery = this.queryProcessor.hashQuery(query);
 
+    // ARROW_STREAM may resolve to EXTERNAL_LINKS, which returns pre-signed URLs
+    // that typically expire well before queryDefaults.cache.ttl. Disable cache
+    // for ARROW_STREAM to avoid handing out dead URLs from cache.
+    const cacheConfig =
+      format === "ARROW_STREAM"
+        ? { ...queryDefaults.cache, enabled: false }
+        : {
+            ...queryDefaults.cache,
+            cacheKey: [
+              "analytics:query",
+              query_key,
+              JSON.stringify(parameters),
+              format,
+              hashedQuery,
+              executorKey,
+            ],
+          };
+
     const defaultConfig: PluginExecuteConfig = {
       ...queryDefaults,
-      cache: {
-        ...queryDefaults.cache,
-        cacheKey: [
-          "analytics:query",
-          query_key,
-          JSON.stringify(parameters),
-          JSON.stringify(format),
-          hashedQuery,
-          executorKey,
-        ],
-      },
+      cache: cacheConfig,
     };
 
     const streamExecutionSettings: StreamExecutionSettings = {
@@ -230,17 +245,17 @@
       );
       return { type: "result", ...result };
     } catch (err: unknown) {
-      const msg = err instanceof Error ? err.message : String(err);
-      const isFormatError =
-        msg.includes("ARROW_STREAM") ||
-        msg.includes("INLINE") ||
-        msg.includes("INVALID_PARAMETER_VALUE") ||
-        msg.includes("NOT_IMPLEMENTED");
+      // If the request was aborted, do not retry — the signal is dead and
+      // a second statement would be billed but never read.
+      if (signal?.aborted) {
+        throw err;
+      }
 
-      if (!isFormatError) {
+      if (!_isInlineArrowUnsupported(err)) {
         throw err;
       }
 
+      const msg = err instanceof Error ? err.message : String(err);
       logger.warn(
         "ARROW_STREAM INLINE rejected by warehouse, falling back to EXTERNAL_LINKS: %s",
         msg,
@@ -326,6 +341,35 @@
   }
 }
 
+/**
+ * Determine whether a warehouse error indicates that ARROW_STREAM + INLINE
+ * is unsupported, vs an unrelated SQL/permission error that happens to mention
+ * one of the keywords.
Requires both "INLINE" and "ARROW_STREAM" in the message + * plus a marker phrase, or a structured `error_code` (e.g. from a wrapped JSON + * response of the form `Response from server (Bad Request) {"error_code":...}`). + */ +function _isInlineArrowUnsupported(err: unknown): boolean { + const msg = err instanceof Error ? err.message : String(err); + + const errorCodeMatch = msg.match(/"error_code"\s*:\s*"([^"]+)"/); + const errorCode = errorCodeMatch?.[1]; + if ( + errorCode === "INVALID_PARAMETER_VALUE" || + errorCode === "NOT_IMPLEMENTED" + ) { + return msg.includes("INLINE") || msg.includes("ARROW_STREAM"); + } + + if (!msg.includes("INLINE") || !msg.includes("ARROW_STREAM")) { + return false; + } + return ( + msg.includes("not supported") || + msg.includes("INVALID_PARAMETER_VALUE") || + msg.includes("NOT_IMPLEMENTED") + ); +} + /** * @internal */ diff --git a/packages/appkit/src/type-generator/query-registry.ts b/packages/appkit/src/type-generator/query-registry.ts index 4dbdb2596..a950a4a52 100644 --- a/packages/appkit/src/type-generator/query-registry.ts +++ b/packages/appkit/src/type-generator/query-registry.ts @@ -386,32 +386,16 @@ export async function generateQueriesFromDescribe( sqlHash, cleanedSql, }: (typeof uncachedQueries)[number]): Promise => { - let result: DatabricksStatementExecutionResponse; - try { - // Prefer JSON_ARRAY for predictable data_array parsing. - result = (await client.statementExecution.executeStatement({ - statement: `DESCRIBE QUERY ${cleanedSql}`, - warehouse_id: warehouseId, - format: "JSON_ARRAY", - disposition: "INLINE", - })) as DatabricksStatementExecutionResponse; - } catch (err: unknown) { - const msg = err instanceof Error ? err.message : String(err); - if (msg.includes("ARROW_STREAM") || msg.includes("JSON_ARRAY")) { - // Warehouse doesn't support JSON_ARRAY inline — retry with no format - // to let it use its default (typically ARROW_STREAM inline). - logger.debug( - "Warehouse rejected JSON_ARRAY for %s, retrying with default format", - queryName, - ); - result = (await client.statementExecution.executeStatement({ - statement: `DESCRIBE QUERY ${cleanedSql}`, - warehouse_id: warehouseId, - })) as DatabricksStatementExecutionResponse; - } else { - throw err; - } - } + // Always request JSON_ARRAY + INLINE so the downstream caller can parse + // `data_array` predictably. If the warehouse rejects this combination, + // let the error propagate — the surrounding `Promise.allSettled` will + // generate `unknown` types via `generateUnknownResultQuery`. + const result = (await client.statementExecution.executeStatement({ + statement: `DESCRIBE QUERY ${cleanedSql}`, + warehouse_id: warehouseId, + format: "JSON_ARRAY", + disposition: "INLINE", + })) as DatabricksStatementExecutionResponse; completed++; spinner.update( From 997d6a7de7045c6a320dd61e55f480d9778e8388 Mon Sep 17 00:00:00 2001 From: James Broadhead Date: Mon, 27 Apr 2026 20:07:50 +0000 Subject: [PATCH 11/17] fix: keep ARROW_STREAM contract consistent across INLINE/EXTERNAL_LINKS MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit useAnalyticsQuery({ format: "ARROW_STREAM" }) is typed as TypedArrowTable, but the previous implementation decoded inline IPC attachments to row objects on the server and emitted them as { type: "result", data: rows }. Same requested format produced a TypedArrowTable for EXTERNAL_LINKS warehouses and a plain Row[] for INLINE/serverless warehouses, breaking direct callers that invoked Arrow Table methods. 
This commit makes ARROW_STREAM always deliver an Arrow Table to the client: - SQL connector forwards the base64 IPC attachment through unchanged (with a 64 MiB size cap) instead of decoding it server-side. - Analytics route emits a new SSE message { type: "arrow_inline", attachment: } when ARROW_STREAM + INLINE returns an attachment. - useAnalyticsQuery decodes the base64 payload via the existing ArrowClient.processArrowBuffer pipeline, producing a Table with the same runtime shape as the EXTERNAL_LINKS path. - Drops apache-arrow from the appkit (server) dependencies — decoding now lives only in appkit-ui where Arrow was already loaded. Tests: - Updated client.test.ts to assert the connector preserves the attachment. - Added a size-cap test for oversized inline attachments. - Added analytics.test.ts coverage for: arrow_inline SSE emission, format validation rejecting unknown values with 400, and aborted-signal short-circuit before fallback retry. - Added use-analytics-query.test.ts (new) covering arrow_inline decoding, decode-failure error path, and JSON_ARRAY result handling. Co-authored-by: Isaac Signed-off-by: James Broadhead --- .../__tests__/use-analytics-query.test.ts | 97 +++++++++++++++++ .../src/react/hooks/use-analytics-query.ts | 31 +++++- packages/appkit/package.json | 1 - .../src/connectors/sql-warehouse/client.ts | 68 +++--------- .../sql-warehouse/tests/client.test.ts | 32 ++++-- .../appkit/src/plugins/analytics/analytics.ts | 7 ++ .../plugins/analytics/tests/analytics.test.ts | 102 ++++++++++++++++++ pnpm-lock.yaml | 3 - 8 files changed, 276 insertions(+), 65 deletions(-) create mode 100644 packages/appkit-ui/src/react/hooks/__tests__/use-analytics-query.test.ts diff --git a/packages/appkit-ui/src/react/hooks/__tests__/use-analytics-query.test.ts b/packages/appkit-ui/src/react/hooks/__tests__/use-analytics-query.test.ts new file mode 100644 index 000000000..d4bf51010 --- /dev/null +++ b/packages/appkit-ui/src/react/hooks/__tests__/use-analytics-query.test.ts @@ -0,0 +1,97 @@ +import { renderHook, waitFor } from "@testing-library/react"; +import { beforeEach, describe, expect, test, vi } from "vitest"; + +// Capture the onMessage handler so tests can drive SSE messages directly. +let lastConnectArgs: any = null; +const mockProcessArrowBuffer = vi.fn(); +const mockFetchArrow = vi.fn(); + +vi.mock("@/js", () => ({ + connectSSE: vi.fn((args: any) => { + lastConnectArgs = args; + return () => {}; + }), + ArrowClient: { + fetchArrow: (...args: unknown[]) => mockFetchArrow(...args), + processArrowBuffer: (...args: unknown[]) => mockProcessArrowBuffer(...args), + }, +})); + +// useQueryHMR is a no-op shim for tests; mock to avoid HMR side effects. +vi.mock("../use-query-hmr", () => ({ + useQueryHMR: vi.fn(), +})); + +import { useAnalyticsQuery } from "../use-analytics-query"; + +describe("useAnalyticsQuery", () => { + beforeEach(() => { + vi.clearAllMocks(); + lastConnectArgs = null; + }); + + test("decodes arrow_inline base64 attachment via ArrowClient.processArrowBuffer", async () => { + const fakeTable = { numRows: 1, schema: { fields: [] } }; + mockProcessArrowBuffer.mockResolvedValueOnce(fakeTable); + + // 'AQID' decodes to bytes [1, 2, 3]. + const base64 = "AQID"; + + const { result } = renderHook(() => + useAnalyticsQuery("q", null, { format: "ARROW_STREAM" }), + ); + + // Drive the SSE onMessage handler with an arrow_inline payload. 
+ await lastConnectArgs.onMessage({ + data: JSON.stringify({ type: "arrow_inline", attachment: base64 }), + }); + + await waitFor(() => { + expect(result.current.data).toBe(fakeTable); + }); + + expect(mockProcessArrowBuffer).toHaveBeenCalledTimes(1); + const passedBuffer = mockProcessArrowBuffer.mock.calls[0][0] as Uint8Array; + expect(passedBuffer).toBeInstanceOf(Uint8Array); + expect(Array.from(passedBuffer)).toEqual([1, 2, 3]); + // Inline path must NOT trigger a network fetch. + expect(mockFetchArrow).not.toHaveBeenCalled(); + }); + + test("surfaces an error when arrow_inline decode fails", async () => { + mockProcessArrowBuffer.mockRejectedValueOnce(new Error("bad ipc")); + + const { result } = renderHook(() => + useAnalyticsQuery("q", null, { format: "ARROW_STREAM" }), + ); + + await lastConnectArgs.onMessage({ + data: JSON.stringify({ type: "arrow_inline", attachment: "AQID" }), + }); + + await waitFor(() => { + expect(result.current.error).toBe( + "Unable to load data, please try again", + ); + }); + expect(result.current.loading).toBe(false); + }); + + test("still handles type:result rows for JSON_ARRAY", async () => { + const { result } = renderHook(() => + useAnalyticsQuery("q", null, { format: "JSON_ARRAY" }), + ); + + await lastConnectArgs.onMessage({ + data: JSON.stringify({ + type: "result", + data: [{ id: 1 }, { id: 2 }], + }), + }); + + await waitFor(() => { + expect(result.current.data).toEqual([{ id: 1 }, { id: 2 }]); + }); + expect(mockProcessArrowBuffer).not.toHaveBeenCalled(); + }); +}); diff --git a/packages/appkit-ui/src/react/hooks/use-analytics-query.ts b/packages/appkit-ui/src/react/hooks/use-analytics-query.ts index 314bd6e4c..3a3aa8789 100644 --- a/packages/appkit-ui/src/react/hooks/use-analytics-query.ts +++ b/packages/appkit-ui/src/react/hooks/use-analytics-query.ts @@ -22,6 +22,16 @@ function getArrowStreamUrl(id: string) { return `/api/analytics/arrow-result/${id}`; } +/** Decode a base64 string into a Uint8Array suitable for Arrow IPC parsing. */ +function decodeBase64(b64: string): Uint8Array { + const binary = atob(b64); + const bytes = new Uint8Array(binary.length); + for (let i = 0; i < binary.length; i++) { + bytes[i] = binary.charCodeAt(i); + } + return bytes; +} + /** * Subscribe to an analytics query over SSE and returns its latest result. * Integration hook between client and analytics plugin. 
@@ -129,7 +139,7 @@
-      // success - Arrow format
+      // success - Arrow format (external links: fetch from server)
       if (parsed.type === "arrow") {
         try {
           const arrowData = await ArrowClient.fetchArrow(
@@ -151,6 +161,25 @@
         }
       }
 
+      // success - Arrow format (inline: decode base64 IPC payload locally)
+      if (parsed.type === "arrow_inline") {
+        try {
+          const buffer = decodeBase64(parsed.attachment);
+          const table = await ArrowClient.processArrowBuffer(buffer);
+          setLoading(false);
+          setData(table as ResultType);
+          return;
+        } catch (error) {
+          console.error(
+            "[useAnalyticsQuery] Failed to decode inline Arrow data",
+            error,
+          );
+          setLoading(false);
+          setError("Unable to load data, please try again");
+          return;
+        }
+      }
+
       // error
       if (parsed.type === "error" || parsed.error || parsed.code) {
         const errorMsg =
diff --git a/packages/appkit/package.json b/packages/appkit/package.json
index 04232f88b..3b57014c0 100644
--- a/packages/appkit/package.json
+++ b/packages/appkit/package.json
@@ -69,7 +69,6 @@
     "@opentelemetry/sdk-trace-base": "2.6.0",
     "@opentelemetry/semantic-conventions": "1.38.0",
     "@types/semver": "7.7.1",
-    "apache-arrow": "21.1.0",
     "dotenv": "16.6.1",
     "express": "4.22.0",
     "obug": "2.1.1",
diff --git a/packages/appkit/src/connectors/sql-warehouse/client.ts b/packages/appkit/src/connectors/sql-warehouse/client.ts
index 8a25a9f12..050e5cd8a 100644
--- a/packages/appkit/src/connectors/sql-warehouse/client.ts
+++ b/packages/appkit/src/connectors/sql-warehouse/client.ts
@@ -3,7 +3,6 @@ import {
   type sql,
   type WorkspaceClient,
 } from "@databricks/sdk-experimental";
-import { tableFromIPC } from "apache-arrow";
 import type { TelemetryOptions } from "shared";
 import {
   AppKitError,
@@ -26,32 +25,9 @@ import { executeStatementDefaults } from "./defaults";
 
 const logger = createLogger("connectors:sql-warehouse");
 
-/** Maximum decoded size for inline Arrow IPC attachments (64 MiB). */
+/** Maximum size for inline Arrow IPC attachments (64 MiB decoded). */
 const MAX_INLINE_ATTACHMENT_BYTES = 64 * 1024 * 1024;
 
-/**
- * Convert Arrow row values to JSON-serializable shapes.
- * `apache-arrow` returns `BigInt` for INT64/DECIMAL columns, which `JSON.stringify`
- * cannot serialize. Convert BigInts to a Number when in safe-integer range,
- * otherwise to a string to preserve precision. `Date` objects serialize fine
- * (ISO string) and are left alone.
- */
-function normalizeArrowRow(
-  row: Record<string, unknown>,
-): Record<string, unknown> {
-  for (const key in row) {
-    const v = row[key];
-    if (typeof v === "bigint") {
-      row[key] =
-        v <= BigInt(Number.MAX_SAFE_INTEGER) &&
-        v >= BigInt(Number.MIN_SAFE_INTEGER)
-          ? Number(v)
-          : v.toString();
-    }
-  }
-  return row;
-}
-
 interface SQLWarehouseConfig {
   timeout?: number;
   telemetry?: TelemetryOptions;
@@ -424,9 +400,12 @@
       | (sql.ResultData & { attachment?: string })
       | undefined;
 
-      // Inline Arrow: some warehouses return base64 Arrow IPC in `attachment`.
+      // Inline Arrow: pass the base64 IPC attachment through unmodified so
+      // the analytics route can stream it to the client, where the existing
+      // ArrowClient infrastructure decodes it into a Table. Validate size
+      // here to fail fast on runaway payloads.
       if (result?.attachment) {
-        return this._transformArrowAttachment(response, result.attachment);
+        return this._validateArrowAttachment(response, result.attachment);
       }
 
       // External links: data fetched separately via statement_id.
@@ -483,15 +462,19 @@
   }
 
   /**
-   * Decode a base64 Arrow IPC attachment into row objects.
+   * Validate (but do not decode) a base64 Arrow IPC attachment.
    * Some serverless warehouses return inline results as Arrow IPC in
-   * `result.attachment` rather than `result.data_array`.
+   * `result.attachment`. We pass the base64 string through to the client,
+   * which decodes it into an Arrow Table via the existing ArrowClient
+   * infrastructure. This keeps the wire contract for ARROW_STREAM
+   * consistent (client always receives an Arrow Table) and avoids
+   * decode/re-encode work on the server.
    */
-  private _transformArrowAttachment(
+  private _validateArrowAttachment(
     response: sql.StatementResponse,
     attachment: string,
   ) {
-    // Cap the decoded size to protect against unbounded inline payloads from
+    // Cap the size to protect against unbounded inline payloads from
     // misbehaving warehouses. 64 MiB is well above the typical inline limit
     // (~16 MiB) but bounds memory if a server returns a runaway response.
     const decodedSize = Math.ceil((attachment.length * 3) / 4);
@@ -500,28 +483,7 @@
         `Inline Arrow attachment exceeds maximum size (${decodedSize} > ${MAX_INLINE_ATTACHMENT_BYTES} bytes)`,
       );
     }
-
-    let data: Record<string, unknown>[];
-    try {
-      const buf = Buffer.from(attachment, "base64");
-      const table = tableFromIPC(buf);
-      data = table.toArray().map((row) => normalizeArrowRow(row.toJSON()));
-    } catch (err: unknown) {
-      const msg = err instanceof Error ? err.message : String(err);
-      throw ExecutionError.statementFailed(
-        `Failed to decode inline Arrow attachment: ${msg}`,
-      );
-    }
-    const { attachment: _att, ...restResult } = response.result as {
-      attachment?: string;
-    } & sql.ResultData;
-    return {
-      ...response,
-      result: {
-        ...restResult,
-        data,
-      },
-    };
+    return response;
   }
 
   private updateWithArrowStatus(response: sql.StatementResponse): {
diff --git a/packages/appkit/src/connectors/sql-warehouse/tests/client.test.ts b/packages/appkit/src/connectors/sql-warehouse/tests/client.test.ts
index 73bc8cda3..5f779ff05 100644
--- a/packages/appkit/src/connectors/sql-warehouse/tests/client.test.ts
+++ b/packages/appkit/src/connectors/sql-warehouse/tests/client.test.ts
@@ -139,7 +139,7 @@ describe("SQLWarehouseConnector._transformDataArray", () => {
   });
 
   describe("serverless warehouse (INLINE + ARROW_STREAM with attachment)", () => {
-    test("decodes base64 Arrow IPC attachment into row objects", () => {
+    test("passes attachment through unchanged for client-side decoding", () => {
       const connector = createConnector();
       // Real response shape from serverless warehouse: INLINE + ARROW_STREAM
       // Data arrives in result.attachment as base64-encoded Arrow IPC, not data_array.
@@ -179,13 +179,13 @@ describe("SQLWarehouseConnector._transformDataArray", () => { } as unknown as sql.StatementResponse; const result = (connector as any)._transformDataArray(response); - expect(result.result.data).toEqual([{ test_col: 1, test_col2: 2 }]); - expect(result.result.attachment).toBeUndefined(); + expect(result.result.attachment).toBe(REAL_ARROW_ATTACHMENT); + expect(result.result.data).toBeUndefined(); // Preserves other result fields expect(result.result.row_count).toBe(1); }); - test("preserves manifest and status alongside decoded data", () => { + test("preserves manifest and status alongside attachment", () => { const connector = createConnector(); const response = { statement_id: "00000001-test-stmt", @@ -207,9 +207,26 @@ describe("SQLWarehouseConnector._transformDataArray", () => { } as unknown as sql.StatementResponse; const result = (connector as any)._transformDataArray(response); - // Manifest and statement_id are preserved + // Manifest, statement_id, and attachment are all preserved expect(result.manifest.format).toBe("ARROW_STREAM"); expect(result.statement_id).toBe("00000001-test-stmt"); + expect(result.result.attachment).toBe(REAL_ARROW_ATTACHMENT); + }); + + test("rejects oversized attachments to bound memory", () => { + const connector = createConnector(); + // 64 MiB cap → ~85 MiB of base64 chars decode to >64 MiB. + const oversized = "A".repeat(90 * 1024 * 1024); + const response = { + statement_id: "stmt-oversized", + status: { state: "SUCCEEDED" }, + manifest: { format: "ARROW_STREAM" }, + result: { attachment: oversized }, + } as unknown as sql.StatementResponse; + + expect(() => (connector as any)._transformDataArray(response)).toThrow( + /exceeds maximum size/, + ); }); }); @@ -279,8 +296,9 @@ describe("SQLWarehouseConnector._transformDataArray", () => { } as unknown as sql.StatementResponse; const result = (connector as any)._transformDataArray(response); - // Should use attachment (Arrow IPC), not data_array - expect(result.result.data).toEqual([{ test_col: 1, test_col2: 2 }]); + // Should pass attachment through (client decodes), not transform data_array + expect(result.result.attachment).toBe(REAL_ARROW_ATTACHMENT); + expect(result.result.data).toBeUndefined(); }); }); }); diff --git a/packages/appkit/src/plugins/analytics/analytics.ts b/packages/appkit/src/plugins/analytics/analytics.ts index e7b5ae15d..b26dd4c01 100644 --- a/packages/appkit/src/plugins/analytics/analytics.ts +++ b/packages/appkit/src/plugins/analytics/analytics.ts @@ -243,6 +243,13 @@ export class AnalyticsPlugin extends Plugin { { disposition: "INLINE", format: "ARROW_STREAM" }, signal, ); + // INLINE responses with an Arrow IPC attachment are forwarded as base64 + // for the client to decode into an Arrow Table. Anything else (rare: + // data_array under ARROW_STREAM, or an empty result) falls back to the + // generic "result" payload. 
+ if (result?.attachment) { + return { type: "arrow_inline", attachment: result.attachment }; + } return { type: "result", ...result }; } catch (err: unknown) { // If the request was aborted, do not retry — the signal is dead and diff --git a/packages/appkit/src/plugins/analytics/tests/analytics.test.ts b/packages/appkit/src/plugins/analytics/tests/analytics.test.ts index 5a477763f..643ee2ca9 100644 --- a/packages/appkit/src/plugins/analytics/tests/analytics.test.ts +++ b/packages/appkit/src/plugins/analytics/tests/analytics.test.ts @@ -765,6 +765,108 @@ describe("Analytics Plugin", () => { } }); + test("/query/:query_key emits arrow_inline SSE event when ARROW_STREAM INLINE returns an attachment", async () => { + const plugin = new AnalyticsPlugin(config); + const { router, getHandler } = createMockRouter(); + + (plugin as any).app.getAppQuery = vi.fn().mockResolvedValue({ + query: "SELECT * FROM test", + isAsUser: false, + }); + + const fakeAttachment = "BASE64_ARROW_IPC_BYTES"; + const executeMock = vi.fn().mockResolvedValue({ + result: { attachment: fakeAttachment, row_count: 1 }, + }); + (plugin as any).SQLClient.executeStatement = executeMock; + + plugin.injectRoutes(router); + + const handler = getHandler("POST", "/query/:query_key"); + const mockReq = createMockRequest({ + params: { query_key: "test_query" }, + body: { parameters: {}, format: "ARROW_STREAM" }, + }); + const mockRes = createMockResponse(); + + await handler(mockReq, mockRes); + + // The route should not fall back to EXTERNAL_LINKS — INLINE succeeded. + expect(executeMock).toHaveBeenCalledTimes(1); + expect(executeMock.mock.calls[0][1]).toMatchObject({ + disposition: "INLINE", + format: "ARROW_STREAM", + }); + // SSE payload should use the new arrow_inline message type. + const writeCalls = (mockRes.write as any).mock.calls.map( + (c: any[]) => c[0] as string, + ); + const payload = writeCalls.find((s: string) => s.startsWith("data: ")); + expect(payload).toBeDefined(); + expect(payload).toContain('"type":"arrow_inline"'); + expect(payload).toContain(`"attachment":"${fakeAttachment}"`); + }); + + test("/query/:query_key rejects unknown format values with 400", async () => { + const plugin = new AnalyticsPlugin(config); + const { router, getHandler } = createMockRouter(); + + const executeMock = vi.fn(); + (plugin as any).SQLClient.executeStatement = executeMock; + + plugin.injectRoutes(router); + + const handler = getHandler("POST", "/query/:query_key"); + const mockReq = createMockRequest({ + params: { query_key: "test_query" }, + body: { parameters: {}, format: "JSON" }, + }); + const mockRes = createMockResponse(); + + await handler(mockReq, mockRes); + + expect(mockRes.status).toHaveBeenCalledWith(400); + expect(executeMock).not.toHaveBeenCalled(); + }); + + test("/query/:query_key does not retry the fallback when the request was aborted", async () => { + const plugin = new AnalyticsPlugin(config); + const { router, getHandler } = createMockRouter(); + + (plugin as any).app.getAppQuery = vi.fn().mockResolvedValue({ + query: "SELECT * FROM test", + isAsUser: false, + }); + + const executeMock = vi.fn().mockImplementation((_wc, _opts, signal) => { + // Simulate a signal that becomes aborted before the failure surfaces — + // e.g. the client cancelled the SSE stream mid-query. 
+ signal?.dispatchEvent?.(new Event("abort")); + Object.defineProperty(signal, "aborted", { value: true }); + return Promise.reject( + new Error( + "INVALID_PARAMETER_VALUE: ARROW_STREAM not supported with INLINE disposition", + ), + ); + }); + (plugin as any).SQLClient.executeStatement = executeMock; + + plugin.injectRoutes(router); + + const handler = getHandler("POST", "/query/:query_key"); + const mockReq = createMockRequest({ + params: { query_key: "test_query" }, + body: { parameters: {}, format: "ARROW_STREAM" }, + }); + const mockRes = createMockResponse(); + + await handler(mockReq, mockRes); + + // Even though the error message would normally trigger fallback, the + // aborted signal should short-circuit and prevent a second statement. + expect(executeMock).toHaveBeenCalledTimes(1); + }); + test("/query/:query_key should not fall back when format is explicitly JSON_ARRAY", async () => { const plugin = new AnalyticsPlugin(config); const { router, getHandler } = createMockRouter(); diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 46096f433..54512501d 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -299,9 +299,6 @@ importers: '@types/semver': specifier: 7.7.1 version: 7.7.1 - apache-arrow: - specifier: 21.1.0 - version: 21.1.0 dotenv: specifier: 16.6.1 version: 16.6.1 From 694feedf0162b3c03f1076b4ec7e1c27e213a6e7 Mon Sep 17 00:00:00 2001 From: James Broadhead Date: Mon, 27 Apr 2026 20:33:19 +0000 Subject: [PATCH 12/17] test: drop unrelated files-plugin upload tests from this PR MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit upload-and-write.test.ts (added in 7164d3b) covers FilesPlugin internals (_handleApiError mapping, upload stream size enforcement, cache invalidation, etc.) — entirely separate from this PR's analytics/Arrow scope. The file ships with 9 failing tests against the current files plugin (assertion mismatches on HTTP status codes, Readable.toWeb mock type errors) that block CI here. Removing it from this PR. The coverage push will come back as its own PR where the failures can be properly debugged. 
Co-authored-by: Isaac
Signed-off-by: James Broadhead
---
 .../files/tests/upload-and-write.test.ts | 1245 -----------------
 1 file changed, 1245 deletions(-)
 delete mode 100644 packages/appkit/src/plugins/files/tests/upload-and-write.test.ts

diff --git a/packages/appkit/src/plugins/files/tests/upload-and-write.test.ts b/packages/appkit/src/plugins/files/tests/upload-and-write.test.ts
deleted file mode 100644
index 8da3f021c..000000000
--- a/packages/appkit/src/plugins/files/tests/upload-and-write.test.ts
+++ /dev/null
@@ -1,1245 +0,0 @@
-import { Readable } from "node:stream";
-import { mockServiceContext, setupDatabricksEnv } from "@tools/test-helpers";
-import { afterEach, beforeEach, describe, expect, test, vi } from "vitest";
-import { ServiceContext } from "../../../context/service-context";
-import { AuthenticationError } from "../../../errors";
-import { FilesPlugin } from "../plugin";
-
-const { mockClient, MockApiError, mockCacheInstance } = vi.hoisted(() => {
-  const mockFilesApi = {
-    listDirectoryContents: vi.fn(),
-    download: vi.fn(),
-    getMetadata: vi.fn(),
-    upload: vi.fn(),
-    createDirectory: vi.fn(),
-    delete: vi.fn(),
-  };
-
-  const mockClient = {
-    files: mockFilesApi,
-    config: {
-      host: "https://test.databricks.com",
-      authenticate: vi.fn(),
-    },
-  };
-
-  class MockApiError extends Error {
-    statusCode: number;
-    constructor(message: string, statusCode: number) {
-      super(message);
-      this.name = "ApiError";
-      this.statusCode = statusCode;
-    }
-  }
-
-  const mockCacheInstance = {
-    get: vi.fn(),
-    set: vi.fn(),
-    delete: vi.fn(),
-    getOrExecute: vi.fn(async (_key: unknown[], fn: () => Promise<unknown>) =>
-      fn(),
-    ),
-    generateKey: vi.fn((...args: unknown[]) => JSON.stringify(args)),
-  };
-
-  return { mockFilesApi, mockClient, MockApiError, mockCacheInstance };
-});
-
-vi.mock("@databricks/sdk-experimental", () => ({
-  WorkspaceClient: vi.fn(() => mockClient),
-  ApiError: MockApiError,
-}));
-
-vi.mock("../../../context", async (importOriginal) => {
-  const actual = await importOriginal();
-  return {
-    ...actual,
-    getWorkspaceClient: vi.fn(() => mockClient),
-    isInUserContext: vi.fn(() => true),
-  };
-});
-
-vi.mock("../../../cache", () => ({
-  CacheManager: {
-    getInstanceSync: vi.fn(() => mockCacheInstance),
-  },
-}));
-
-const VOLUMES_CONFIG = {
-  volumes: {
-    uploads: { maxUploadSize: 100_000_000 },
-    exports: {},
-  },
-};
-
-/**
- * Helper to get a route handler from the plugin. Registers routes on a mock
- * router and returns the handler matching the given method + path suffix.
- */
-function getRouteHandler(
-  plugin: FilesPlugin,
-  method: "get" | "post" | "delete",
-  pathSuffix: string,
-) {
-  const mockRouter = {
-    use: vi.fn(),
-    get: vi.fn(),
-    post: vi.fn(),
-    put: vi.fn(),
-    delete: vi.fn(),
-    patch: vi.fn(),
-  } as any;
-
-  plugin.injectRoutes(mockRouter);
-
-  const call = mockRouter[method].mock.calls.find(
-    (c: unknown[]) =>
-      typeof c[0] === "string" && (c[0] as string).endsWith(pathSuffix),
-  );
-  if (!call) throw new Error(`No route found for ${method} ...${pathSuffix}`);
-  return call[call.length - 1] as (req: any, res: any) => Promise<void>;
-}
-
-/**
- * Creates a mock Express response with all methods needed by the route handlers.
- */
-function mockRes() {
-  const res: any = {
-    headersSent: false,
-  };
-  res.status = vi.fn().mockReturnValue(res);
-  res.json = vi.fn().mockReturnValue(res);
-  res.type = vi.fn().mockReturnValue(res);
-  res.send = vi.fn().mockReturnValue(res);
-  res.setHeader = vi.fn().mockReturnValue(res);
-  res.write = vi.fn().mockReturnValue(true);
-  res.destroy = vi.fn();
-  res.end = vi.fn();
-  res.on = vi.fn().mockReturnValue(res);
-  res.once = vi.fn().mockReturnValue(res);
-  res.emit = vi.fn().mockReturnValue(true);
-  res.removeListener = vi.fn().mockReturnValue(res);
-  res.pipe = vi.fn().mockReturnValue(res);
-  return res;
-}
-
-/**
- * Creates a mock Express request with the auth headers needed by the plugin's
- * `asUser()` proxy.
- */
-function mockReq(
-  volumeKey: string,
-  overrides: Record<string, unknown> = {},
-): any {
-  const headers: Record<string, string> = {
-    "x-forwarded-access-token": "test-token",
-    "x-forwarded-user": "test-user",
-    ...(overrides.headers ?? {}),
-  };
-
-  const req: any = {
-    params: { volumeKey },
-    query: {},
-    ...overrides,
-    headers,
-    header: (name: string) => headers[name.toLowerCase()],
-  };
-
-  return req;
-}
-
-/**
- * Creates a mock Express request that behaves as a Node Readable stream,
- * suitable for the upload handler which calls Readable.toWeb(req).
- */
-function mockUploadReq(
-  volumeKey: string,
-  bodyChunks: Buffer[],
-  overrides: Record<string, unknown> = {},
-): any {
-  const headers: Record<string, string> = {
-    "x-forwarded-access-token": "test-token",
-    "x-forwarded-user": "test-user",
-    ...(overrides.headers ?? {}),
-  };
-
-  // Create a real Node Readable so Readable.toWeb() works
-  let chunkIndex = 0;
-  const stream = new Readable({
-    read() {
-      if (chunkIndex < bodyChunks.length) {
-        this.push(bodyChunks[chunkIndex++]);
-      } else {
-        this.push(null);
-      }
-    },
-  });
-
-  // Patch stream with Express request properties
-  (stream as any).params = { volumeKey };
-  (stream as any).query = overrides.query ?? {};
-  (stream as any).headers = headers;
-  (stream as any).header = (name: string) => headers[name.toLowerCase()];
-  (stream as any).body = overrides.body;
-
-  return stream;
-}
-
-describe("FilesPlugin - Upload, Write, and Error Handling", () => {
-  let serviceContextMock: Awaited<ReturnType<typeof mockServiceContext>>;
-
-  beforeEach(async () => {
-    vi.clearAllMocks();
-    setupDatabricksEnv();
-    ServiceContext.reset();
-    process.env.DATABRICKS_VOLUME_UPLOADS = "/Volumes/catalog/schema/uploads";
-    process.env.DATABRICKS_VOLUME_EXPORTS = "/Volumes/catalog/schema/exports";
-    serviceContextMock = await mockServiceContext();
-  });
-
-  afterEach(() => {
-    serviceContextMock?.restore();
-    delete process.env.DATABRICKS_VOLUME_UPLOADS;
-    delete process.env.DATABRICKS_VOLUME_EXPORTS;
-  });
-
-  // ──────────────────────────────────────────────────────────────────────
-  // 1.
_handleApiError: AuthenticationError -> 401, ApiError variants, - // non-ApiError -> 500 - // ────────────────────────────────────────────────────────────────────── - describe("_handleApiError", () => { - test("AuthenticationError returns 401 with error message", async () => { - const plugin = new FilesPlugin(VOLUMES_CONFIG); - const res = mockRes(); - - (plugin as any)._handleApiError( - res, - new AuthenticationError("Missing token"), - "fallback msg", - ); - - expect(res.status).toHaveBeenCalledWith(401); - expect(res.json).toHaveBeenCalledWith({ - error: "Missing token", - plugin: "files", - }); - }); - - test("ApiError with 4xx status preserves status and message", () => { - const plugin = new FilesPlugin(VOLUMES_CONFIG); - const res = mockRes(); - - (plugin as any)._handleApiError( - res, - new MockApiError("Forbidden", 403), - "fallback msg", - ); - - expect(res.status).toHaveBeenCalledWith(403); - expect(res.json).toHaveBeenCalledWith({ - error: "Forbidden", - statusCode: 403, - plugin: "files", - }); - }); - - test("ApiError with 404 preserves status", () => { - const plugin = new FilesPlugin(VOLUMES_CONFIG); - const res = mockRes(); - - (plugin as any)._handleApiError( - res, - new MockApiError("Not found", 404), - "fallback msg", - ); - - expect(res.status).toHaveBeenCalledWith(404); - expect(res.json).toHaveBeenCalledWith({ - error: "Not found", - statusCode: 404, - plugin: "files", - }); - }); - - test("ApiError with 409 Conflict preserves status", () => { - const plugin = new FilesPlugin(VOLUMES_CONFIG); - const res = mockRes(); - - (plugin as any)._handleApiError( - res, - new MockApiError("Conflict", 409), - "fallback msg", - ); - - expect(res.status).toHaveBeenCalledWith(409); - expect(res.json).toHaveBeenCalledWith({ - error: "Conflict", - statusCode: 409, - plugin: "files", - }); - }); - - test("ApiError with 5xx returns 500 with fallback message", () => { - const plugin = new FilesPlugin(VOLUMES_CONFIG); - const res = mockRes(); - - (plugin as any)._handleApiError( - res, - new MockApiError("Bad Gateway", 502), - "Operation failed", - ); - - expect(res.status).toHaveBeenCalledWith(500); - expect(res.json).toHaveBeenCalledWith({ - error: "Operation failed", - plugin: "files", - }); - }); - - test("ApiError with statusCode 500 returns 500 with fallback", () => { - const plugin = new FilesPlugin(VOLUMES_CONFIG); - const res = mockRes(); - - (plugin as any)._handleApiError( - res, - new MockApiError("Internal error", 500), - "Fallback", - ); - - expect(res.status).toHaveBeenCalledWith(500); - expect(res.json).toHaveBeenCalledWith({ - error: "Fallback", - plugin: "files", - }); - }); - - test("non-ApiError falls back to 500 with fallback message", () => { - const plugin = new FilesPlugin(VOLUMES_CONFIG); - const res = mockRes(); - - (plugin as any)._handleApiError(res, new Error("unknown"), "Fallback"); - - expect(res.status).toHaveBeenCalledWith(500); - expect(res.json).toHaveBeenCalledWith({ - error: "Fallback", - plugin: "files", - }); - }); - - test("non-ApiError exception returns 500 with fallback message", async () => { - const plugin = new FilesPlugin(VOLUMES_CONFIG); - const res = mockRes(); - - (plugin as any)._handleApiError( - res, - new TypeError("Cannot read properties of undefined"), - "Internal Server Error", - ); - - expect(res.status).toHaveBeenCalledWith(500); - expect(res.json).toHaveBeenCalledWith({ - error: "Internal Server Error", - plugin: "files", - }); - }); - - test("AuthenticationError via route (missing token in production)", async () => { - const 
plugin = new FilesPlugin(VOLUMES_CONFIG); - const handler = getRouteHandler(plugin, "get", "/list"); - const res = mockRes(); - - const originalEnv = process.env.NODE_ENV; - process.env.NODE_ENV = "production"; - - try { - await handler( - { - params: { volumeKey: "uploads" }, - query: {}, - headers: {}, - header: () => undefined, - }, - res, - ); - - expect(res.status).toHaveBeenCalledWith(401); - expect(res.json).toHaveBeenCalledWith( - expect.objectContaining({ - error: expect.stringContaining("token"), - plugin: "files", - }), - ); - } finally { - process.env.NODE_ENV = originalEnv; - } - }); - }); - - // ────────────────────────────────────────────────────────────────────── - // 2. Upload path: TransformStream size enforcement during streaming - // ────────────────────────────────────────────────────────────────────── - describe("Upload stream mid-transfer size enforcement", () => { - test("upload exceeding size mid-stream is caught by execute and returns error", async () => { - const plugin = new FilesPlugin({ - volumes: { - uploads: { maxUploadSize: 50 }, - }, - }); - const handler = getRouteHandler(plugin, "post", "/upload"); - const res = mockRes(); - - // Two chunks: 30 + 30 = 60 > maxSize of 50 - const req = mockUploadReq( - "uploads", - [Buffer.alloc(30), Buffer.alloc(30)], - { - query: { path: "/Volumes/catalog/schema/uploads/file.bin" }, - // No content-length header so the pre-check does not catch it - }, - ); - - // Spy on the connector's upload to consume the stream (the - // TransformStream size limiter fires when chunks are read). - const connector = (plugin as any).volumeConnectors.uploads; - vi.spyOn(connector, "upload").mockImplementation( - async (_client: any, _path: string, contents: any) => { - const reader = (contents as ReadableStream).getReader(); - while (true) { - const { done } = await reader.read(); - if (done) break; - } - }, - ); - - await handler(req, res); - - // The stream size error is caught by execute() and returned as - // {ok: false, status: 500}. The Content-Length pre-check (tested - // separately) catches oversized uploads before streaming starts. - const statusCalls = res.status.mock.calls.flat(); - expect(statusCalls).toContain(500); - }); - - test("outer catch returns 413 for stream size error escaping execute", async () => { - // The outer catch in _handleUpload has a specific check for the - // "exceeds maximum allowed size" message. This tests that path by - // making execute() re-throw instead of catching. 
- const plugin = new FilesPlugin({ - volumes: { - uploads: { maxUploadSize: 50 }, - }, - }); - const handler = getRouteHandler(plugin, "post", "/upload"); - const res = mockRes(); - - const req = mockUploadReq("uploads", [Buffer.from("data")], { - query: { path: "/Volumes/catalog/schema/uploads/file.bin" }, - }); - - // Override trackWrite to throw the size error directly - vi.spyOn(plugin as any, "trackWrite").mockRejectedValue( - new Error("Upload stream exceeds maximum allowed size (50 bytes)"), - ); - - await handler(req, res); - - expect(res.status).toHaveBeenCalledWith(413); - expect(res.json).toHaveBeenCalledWith( - expect.objectContaining({ - error: expect.stringContaining("exceeds maximum allowed size"), - plugin: "files", - }), - ); - }); - - test("upload within size limit succeeds", async () => { - const plugin = new FilesPlugin({ - volumes: { - uploads: { maxUploadSize: 100 }, - }, - }); - const handler = getRouteHandler(plugin, "post", "/upload"); - const res = mockRes(); - - const req = mockUploadReq( - "uploads", - [Buffer.from("small file content")], - { - query: { path: "/Volumes/catalog/schema/uploads/small.txt" }, - }, - ); - - const connector = (plugin as any).volumeConnectors.uploads; - vi.spyOn(connector, "upload").mockImplementation( - async (_client: any, _path: string, contents: any) => { - const reader = (contents as ReadableStream).getReader(); - while (true) { - const { done } = await reader.read(); - if (done) break; - } - }, - ); - - await handler(req, res); - - expect(res.json).toHaveBeenCalledWith( - expect.objectContaining({ success: true }), - ); - }); - }); - - // ────────────────────────────────────────────────────────────────────── - // 3. Upload: cache invalidation after successful upload - // ────────────────────────────────────────────────────────────────────── - describe("Upload cache invalidation", () => { - test("successful upload calls cache.delete for parent directory", async () => { - const plugin = new FilesPlugin(VOLUMES_CONFIG); - const handler = getRouteHandler(plugin, "post", "/upload"); - const res = mockRes(); - - const req = mockUploadReq("uploads", [Buffer.from("file content")], { - query: { path: "/Volumes/catalog/schema/uploads/dir/file.txt" }, - }); - - const connector = (plugin as any).volumeConnectors.uploads; - vi.spyOn(connector, "upload").mockImplementation( - async (_client: any, _path: string, contents: any) => { - const reader = (contents as ReadableStream).getReader(); - while (true) { - const { done } = await reader.read(); - if (done) break; - } - }, - ); - - await handler(req, res); - - expect(res.json).toHaveBeenCalledWith( - expect.objectContaining({ success: true }), - ); - // _invalidateListCache should call generateKey and then delete - expect(mockCacheInstance.generateKey).toHaveBeenCalled(); - expect(mockCacheInstance.delete).toHaveBeenCalled(); - }); - }); - - // ────────────────────────────────────────────────────────────────────── - // 4. 
Raw endpoint: CSP sandbox header and safe vs unsafe content type - // ────────────────────────────────────────────────────────────────────── - describe("Raw endpoint security headers", () => { - function makeStreamResponse(content: string) { - const stream = new ReadableStream({ - start(controller) { - controller.enqueue(new TextEncoder().encode(content)); - controller.close(); - }, - }); - return { contents: stream }; - } - - test("raw endpoint sets CSP sandbox header", async () => { - const plugin = new FilesPlugin(VOLUMES_CONFIG); - const handler = getRouteHandler(plugin, "get", "/raw"); - const res = mockRes(); - - mockClient.files.download.mockResolvedValue(makeStreamResponse("data")); - - await handler( - mockReq("uploads", { - query: { path: "/Volumes/catalog/schema/uploads/data.json" }, - }), - res, - ); - - expect(res.setHeader).toHaveBeenCalledWith( - "Content-Security-Policy", - "sandbox", - ); - }); - - test("raw endpoint with safe content type (image/png) does not set Content-Disposition", async () => { - const plugin = new FilesPlugin(VOLUMES_CONFIG); - const handler = getRouteHandler(plugin, "get", "/raw"); - const res = mockRes(); - - mockClient.files.download.mockResolvedValue( - makeStreamResponse("PNG data"), - ); - - await handler( - mockReq("uploads", { - query: { path: "/Volumes/catalog/schema/uploads/image.png" }, - }), - res, - ); - - expect(res.setHeader).toHaveBeenCalledWith("Content-Type", "image/png"); - expect(res.setHeader).toHaveBeenCalledWith( - "Content-Security-Policy", - "sandbox", - ); - - // Content-Disposition should NOT be set for safe inline types - const dispositionCalls = res.setHeader.mock.calls.filter( - (c: string[]) => c[0] === "Content-Disposition", - ); - expect(dispositionCalls).toHaveLength(0); - }); - - test("raw endpoint with unsafe content type (text/html) forces download", async () => { - const plugin = new FilesPlugin(VOLUMES_CONFIG); - const handler = getRouteHandler(plugin, "get", "/raw"); - const res = mockRes(); - - mockClient.files.download.mockResolvedValue( - makeStreamResponse(""), - ); - - await handler( - mockReq("uploads", { - query: { path: "/Volumes/catalog/schema/uploads/page.html" }, - }), - res, - ); - - expect(res.setHeader).toHaveBeenCalledWith("Content-Type", "text/html"); - expect(res.setHeader).toHaveBeenCalledWith( - "Content-Security-Policy", - "sandbox", - ); - expect(res.setHeader).toHaveBeenCalledWith( - "Content-Disposition", - 'attachment; filename="page.html"', - ); - }); - - test("raw endpoint with SVG forces download", async () => { - const plugin = new FilesPlugin(VOLUMES_CONFIG); - const handler = getRouteHandler(plugin, "get", "/raw"); - const res = mockRes(); - - mockClient.files.download.mockResolvedValue( - makeStreamResponse(""), - ); - - await handler( - mockReq("uploads", { - query: { path: "/Volumes/catalog/schema/uploads/icon.svg" }, - }), - res, - ); - - expect(res.setHeader).toHaveBeenCalledWith( - "Content-Disposition", - 'attachment; filename="icon.svg"', - ); - }); - - test("raw endpoint sets X-Content-Type-Options: nosniff", async () => { - const plugin = new FilesPlugin(VOLUMES_CONFIG); - const handler = getRouteHandler(plugin, "get", "/raw"); - const res = mockRes(); - - mockClient.files.download.mockResolvedValue( - makeStreamResponse("content"), - ); - - await handler( - mockReq("uploads", { - query: { path: "/Volumes/catalog/schema/uploads/file.txt" }, - }), - res, - ); - - expect(res.setHeader).toHaveBeenCalledWith( - "X-Content-Type-Options", - "nosniff", - ); - }); - - test("raw 
endpoint with missing path returns 400", async () => { - const plugin = new FilesPlugin(VOLUMES_CONFIG); - const handler = getRouteHandler(plugin, "get", "/raw"); - const res = mockRes(); - - await handler(mockReq("uploads", { query: {} }), res); - - expect(res.status).toHaveBeenCalledWith(400); - expect(res.json).toHaveBeenCalledWith( - expect.objectContaining({ - error: "path is required", - plugin: "files", - }), - ); - }); - }); - - // ────────────────────────────────────────────────────────────────────── - // 5. Download endpoint: Content-Disposition with sanitized filename - // ────────────────────────────────────────────────────────────────────── - describe("Download endpoint Content-Disposition", () => { - function makeStreamResponse(content: string) { - const stream = new ReadableStream({ - start(controller) { - controller.enqueue(new TextEncoder().encode(content)); - controller.close(); - }, - }); - return { contents: stream }; - } - - test("download sets Content-Disposition: attachment with filename", async () => { - const plugin = new FilesPlugin(VOLUMES_CONFIG); - const handler = getRouteHandler(plugin, "get", "/download"); - const res = mockRes(); - - mockClient.files.download.mockResolvedValue( - makeStreamResponse("file data"), - ); - - await handler( - mockReq("uploads", { - query: { path: "/Volumes/catalog/schema/uploads/report.pdf" }, - }), - res, - ); - - expect(res.setHeader).toHaveBeenCalledWith( - "Content-Disposition", - 'attachment; filename="report.pdf"', - ); - }); - - test("download sanitizes filename with special characters", async () => { - const plugin = new FilesPlugin(VOLUMES_CONFIG); - const handler = getRouteHandler(plugin, "get", "/download"); - const res = mockRes(); - - mockClient.files.download.mockResolvedValue(makeStreamResponse("data")); - - await handler( - mockReq("uploads", { - query: { path: '/Volumes/catalog/schema/uploads/my "file".txt' }, - }), - res, - ); - - // Quotes in filenames should be escaped - expect(res.setHeader).toHaveBeenCalledWith( - "Content-Disposition", - 'attachment; filename="my \\"file\\".txt"', - ); - }); - - test("download always sets Content-Disposition even for safe types", async () => { - const plugin = new FilesPlugin(VOLUMES_CONFIG); - const handler = getRouteHandler(plugin, "get", "/download"); - const res = mockRes(); - - mockClient.files.download.mockResolvedValue(makeStreamResponse("{}")); - - await handler( - mockReq("uploads", { - query: { path: "/Volumes/catalog/schema/uploads/data.json" }, - }), - res, - ); - - // Download mode always forces attachment, even for safe types - expect(res.setHeader).toHaveBeenCalledWith( - "Content-Disposition", - 'attachment; filename="data.json"', - ); - }); - - test("download with missing path returns 400", async () => { - const plugin = new FilesPlugin(VOLUMES_CONFIG); - const handler = getRouteHandler(plugin, "get", "/download"); - const res = mockRes(); - - await handler(mockReq("uploads", { query: {} }), res); - - expect(res.status).toHaveBeenCalledWith(400); - expect(res.json).toHaveBeenCalledWith( - expect.objectContaining({ - error: "path is required", - plugin: "files", - }), - ); - }); - - test("download with response having no contents calls res.end()", async () => { - const plugin = new FilesPlugin(VOLUMES_CONFIG); - const handler = getRouteHandler(plugin, "get", "/download"); - const res = mockRes(); - - // Response with no contents field (empty file) - mockClient.files.download.mockResolvedValue({}); - - await handler( - mockReq("uploads", { - query: { path: 
"/Volumes/catalog/schema/uploads/empty.txt" }, - }), - res, - ); - - expect(res.end).toHaveBeenCalled(); - }); - }); - - // ────────────────────────────────────────────────────────────────────── - // 6. Delete endpoint: cache invalidation - // ────────────────────────────────────────────────────────────────────── - describe("Delete cache invalidation", () => { - test("successful delete invalidates list cache", async () => { - const plugin = new FilesPlugin(VOLUMES_CONFIG); - const handler = getRouteHandler(plugin, "delete", ""); - const res = mockRes(); - - mockClient.files.delete.mockResolvedValue(undefined); - - await handler( - mockReq("uploads", { - query: { path: "/Volumes/catalog/schema/uploads/dir/file.txt" }, - }), - res, - ); - - expect(res.json).toHaveBeenCalledWith( - expect.objectContaining({ success: true }), - ); - expect(mockCacheInstance.generateKey).toHaveBeenCalled(); - expect(mockCacheInstance.delete).toHaveBeenCalled(); - }); - - test("delete without path returns 400", async () => { - const plugin = new FilesPlugin(VOLUMES_CONFIG); - const handler = getRouteHandler(plugin, "delete", ""); - const res = mockRes(); - - await handler(mockReq("uploads", { query: {} }), res); - - expect(res.status).toHaveBeenCalledWith(400); - expect(res.json).toHaveBeenCalledWith( - expect.objectContaining({ error: "path is required" }), - ); - }); - - test("delete that throws ApiError returns proper status", async () => { - const plugin = new FilesPlugin(VOLUMES_CONFIG); - const handler = getRouteHandler(plugin, "delete", ""); - const res = mockRes(); - - mockClient.files.delete.mockRejectedValue( - new MockApiError("Not found", 404), - ); - - await handler( - mockReq("uploads", { - query: { path: "/Volumes/catalog/schema/uploads/missing.txt" }, - }), - res, - ); - - // SDK errors go through execute() which returns {ok: false, status: 404} - // then _sendStatusError is called with STATUS_CODES[404] = "Not Found" - expect(res.status).toHaveBeenCalledWith(404); - }); - }); - - // ────────────────────────────────────────────────────────────────────── - // 7. 
Mkdir endpoint: cache invalidation - // ────────────────────────────────────────────────────────────────────── - describe("Mkdir cache invalidation", () => { - test("successful mkdir invalidates list cache", async () => { - const plugin = new FilesPlugin(VOLUMES_CONFIG); - const handler = getRouteHandler(plugin, "post", "/mkdir"); - const res = mockRes(); - - mockClient.files.createDirectory.mockResolvedValue(undefined); - - await handler( - mockReq("uploads", { - body: { path: "/Volumes/catalog/schema/uploads/newdir" }, - }), - res, - ); - - expect(res.json).toHaveBeenCalledWith( - expect.objectContaining({ success: true }), - ); - expect(mockCacheInstance.generateKey).toHaveBeenCalled(); - expect(mockCacheInstance.delete).toHaveBeenCalled(); - }); - - test("mkdir without path returns 400", async () => { - const plugin = new FilesPlugin(VOLUMES_CONFIG); - const handler = getRouteHandler(plugin, "post", "/mkdir"); - const res = mockRes(); - - await handler(mockReq("uploads", { body: {} }), res); - - expect(res.status).toHaveBeenCalledWith(400); - expect(res.json).toHaveBeenCalledWith( - expect.objectContaining({ error: "path is required" }), - ); - }); - - test("mkdir that throws ApiError 409 is handled via execute", async () => { - const plugin = new FilesPlugin(VOLUMES_CONFIG); - const handler = getRouteHandler(plugin, "post", "/mkdir"); - const res = mockRes(); - - mockClient.files.createDirectory.mockRejectedValue( - new MockApiError("Conflict", 409), - ); - - await handler( - mockReq("uploads", { - body: { path: "/Volumes/catalog/schema/uploads/existing" }, - }), - res, - ); - - // SDK errors go through execute() -> _sendStatusError with status 409 - expect(res.status).toHaveBeenCalledWith(409); - }); - }); - - // ────────────────────────────────────────────────────────────────────── - // 8. 
Shutdown: trackWrite waits for in-flight writes, deadline timeout - // ────────────────────────────────────────────────────────────────────── - describe("Shutdown and trackWrite", () => { - beforeEach(() => { - vi.useFakeTimers(); - }); - - afterEach(() => { - vi.useRealTimers(); - }); - - test("shutdown waits for in-flight writes to complete", async () => { - const plugin = new FilesPlugin(VOLUMES_CONFIG); - - // Simulate an in-flight write - (plugin as any).inflightWrites = 1; - - const shutdownPromise = plugin.shutdown(); - - // After 500ms the shutdown loop should still be waiting - await vi.advanceTimersByTimeAsync(500); - - // Simulate the write completing - (plugin as any).inflightWrites = 0; - - await vi.advanceTimersByTimeAsync(500); - await shutdownPromise; - - // Shutdown should have completed - expect((plugin as any).inflightWrites).toBe(0); - }); - - test("shutdown times out after 10 seconds with pending writes", async () => { - const plugin = new FilesPlugin(VOLUMES_CONFIG); - const abortAllSpy = vi.spyOn((plugin as any).streamManager, "abortAll"); - - // Simulate an in-flight write that never completes - (plugin as any).inflightWrites = 2; - - const shutdownPromise = plugin.shutdown(); - - // Advance past the 10-second deadline - await vi.advanceTimersByTimeAsync(11_000); - await shutdownPromise; - - // Should still call abortAll even after timeout - expect(abortAllSpy).toHaveBeenCalled(); - // inflightWrites remains > 0 since the writes never completed - expect((plugin as any).inflightWrites).toBe(2); - }); - - test("shutdown completes immediately when no in-flight writes", async () => { - const plugin = new FilesPlugin(VOLUMES_CONFIG); - const abortAllSpy = vi.spyOn((plugin as any).streamManager, "abortAll"); - - (plugin as any).inflightWrites = 0; - - const shutdownPromise = plugin.shutdown(); - await vi.advanceTimersByTimeAsync(0); - await shutdownPromise; - - expect(abortAllSpy).toHaveBeenCalled(); - }); - - test("trackWrite increments and decrements inflightWrites correctly", async () => { - const plugin = new FilesPlugin(VOLUMES_CONFIG); - expect((plugin as any).inflightWrites).toBe(0); - - let resolveInner!: (value: string) => void; - const innerPromise = new Promise((r) => { - resolveInner = r; - }); - - const trackPromise = (plugin as any).trackWrite(() => innerPromise); - - // While the tracked fn is running, inflightWrites should be 1 - expect((plugin as any).inflightWrites).toBe(1); - - resolveInner("done"); - const result = await trackPromise; - - expect(result).toBe("done"); - expect((plugin as any).inflightWrites).toBe(0); - }); - - test("trackWrite decrements inflightWrites even on rejection", async () => { - const plugin = new FilesPlugin(VOLUMES_CONFIG); - - const trackPromise = (plugin as any).trackWrite(() => - Promise.reject(new Error("write failed")), - ); - - await expect(trackPromise).rejects.toThrow("write failed"); - expect((plugin as any).inflightWrites).toBe(0); - }); - }); - - // ────────────────────────────────────────────────────────────────────── - // 9. 
Volume discovery: merging explicit config with env vars - // ────────────────────────────────────────────────────────────────────── - describe("Volume discovery merging", () => { - test("explicit config takes priority over env vars", () => { - const volumes = FilesPlugin.discoverVolumes({ - volumes: { - uploads: { maxUploadSize: 42 }, - custom: { maxUploadSize: 99 }, - }, - }); - - // uploads: explicit config wins (maxUploadSize: 42), not {} from env - expect(volumes.uploads).toEqual({ maxUploadSize: 42 }); - // exports: discovered from env with default empty config - expect(volumes.exports).toEqual({}); - // custom: explicit only, no env var - expect(volumes.custom).toEqual({ maxUploadSize: 99 }); - }); - - test("discovered volumes get empty config objects", () => { - process.env.DATABRICKS_VOLUME_DATA = "/Volumes/catalog/schema/data"; - - try { - const volumes = FilesPlugin.discoverVolumes({}); - expect(volumes.data).toEqual({}); - } finally { - delete process.env.DATABRICKS_VOLUME_DATA; - } - }); - - test("explicit volumes without env vars still appear", () => { - delete process.env.DATABRICKS_VOLUME_UPLOADS; - delete process.env.DATABRICKS_VOLUME_EXPORTS; - - const volumes = FilesPlugin.discoverVolumes({ - volumes: { - private: { maxUploadSize: 10 }, - }, - }); - - expect(Object.keys(volumes)).toEqual(["private"]); - expect(volumes.private).toEqual({ maxUploadSize: 10 }); - }); - - test("env var volume is not added when explicit config has the same key", () => { - process.env.DATABRICKS_VOLUME_SPECIAL = "/Volumes/catalog/schema/special"; - - try { - const volumes = FilesPlugin.discoverVolumes({ - volumes: { - special: { maxUploadSize: 500 }, - }, - }); - - // Explicit wins; should not be overwritten with {} - expect(volumes.special).toEqual({ maxUploadSize: 500 }); - } finally { - delete process.env.DATABRICKS_VOLUME_SPECIAL; - } - }); - }); - - // ────────────────────────────────────────────────────────────────────── - // 10. 
Path validation edge cases - // ────────────────────────────────────────────────────────────────────── - describe("Path validation", () => { - test("path with null bytes returns 400", async () => { - const plugin = new FilesPlugin(VOLUMES_CONFIG); - const handler = getRouteHandler(plugin, "get", "/read"); - const res = mockRes(); - - await handler( - mockReq("uploads", { query: { path: "/Volumes/test/\0evil" } }), - res, - ); - - expect(res.status).toHaveBeenCalledWith(400); - expect(res.json).toHaveBeenCalledWith( - expect.objectContaining({ - error: "path must not contain null bytes", - }), - ); - }); - - test("path exceeding 4096 characters returns 400", async () => { - const plugin = new FilesPlugin(VOLUMES_CONFIG); - const handler = getRouteHandler(plugin, "get", "/read"); - const res = mockRes(); - - const longPath = "/Volumes/test/" + "a".repeat(4100); - - await handler(mockReq("uploads", { query: { path: longPath } }), res); - - expect(res.status).toHaveBeenCalledWith(400); - expect(res.json).toHaveBeenCalledWith( - expect.objectContaining({ - error: expect.stringContaining("exceeds maximum length"), - }), - ); - }); - - test("exists without path returns 400", async () => { - const plugin = new FilesPlugin(VOLUMES_CONFIG); - const handler = getRouteHandler(plugin, "get", "/exists"); - const res = mockRes(); - - await handler(mockReq("uploads", { query: {} }), res); - - expect(res.status).toHaveBeenCalledWith(400); - expect(res.json).toHaveBeenCalledWith( - expect.objectContaining({ - error: "path is required", - plugin: "files", - }), - ); - }); - - test("metadata without path returns 400", async () => { - const plugin = new FilesPlugin(VOLUMES_CONFIG); - const handler = getRouteHandler(plugin, "get", "/metadata"); - const res = mockRes(); - - await handler(mockReq("uploads", { query: {} }), res); - - expect(res.status).toHaveBeenCalledWith(400); - expect(res.json).toHaveBeenCalledWith( - expect.objectContaining({ - error: "path is required", - plugin: "files", - }), - ); - }); - - test("preview without path returns 400", async () => { - const plugin = new FilesPlugin(VOLUMES_CONFIG); - const handler = getRouteHandler(plugin, "get", "/preview"); - const res = mockRes(); - - await handler(mockReq("uploads", { query: {} }), res); - - expect(res.status).toHaveBeenCalledWith(400); - expect(res.json).toHaveBeenCalledWith( - expect.objectContaining({ - error: "path is required", - plugin: "files", - }), - ); - }); - - test("upload without path returns 400", async () => { - const plugin = new FilesPlugin(VOLUMES_CONFIG); - const handler = getRouteHandler(plugin, "post", "/upload"); - const res = mockRes(); - - const req = mockUploadReq("uploads", [Buffer.from("data")], { - query: {}, - }); - - await handler(req, res); - - expect(res.status).toHaveBeenCalledWith(400); - expect(res.json).toHaveBeenCalledWith( - expect.objectContaining({ - error: "path is required", - plugin: "files", - }), - ); - }); - - test("delete with null bytes in path returns 400", async () => { - const plugin = new FilesPlugin(VOLUMES_CONFIG); - const handler = getRouteHandler(plugin, "delete", ""); - const res = mockRes(); - - await handler( - mockReq("uploads", { query: { path: "/Volumes/test/\0evil" } }), - res, - ); - - expect(res.status).toHaveBeenCalledWith(400); - expect(res.json).toHaveBeenCalledWith( - expect.objectContaining({ - error: "path must not contain null bytes", - }), - ); - }); - }); - - // ────────────────────────────────────────────────────────────────────── - // 11. 
clientConfig returns volume keys
-  // ──────────────────────────────────────────────────────────────────────
-  describe("clientConfig", () => {
-    test("returns configured volume keys", () => {
-      const plugin = new FilesPlugin(VOLUMES_CONFIG);
-      const config = plugin.clientConfig();
-
-      expect(config).toEqual({ volumes: ["uploads", "exports"] });
-    });
-
-    test("returns empty volumes when none configured and no env vars", () => {
-      delete process.env.DATABRICKS_VOLUME_UPLOADS;
-      delete process.env.DATABRICKS_VOLUME_EXPORTS;
-
-      const plugin = new FilesPlugin({ volumes: {} });
-      const config = plugin.clientConfig();
-
-      expect(config).toEqual({ volumes: [] });
-    });
-  });
-
-  // ──────────────────────────────────────────────────────────────────────
-  // 12. _sendStatusError uses HTTP status code text
-  // ──────────────────────────────────────────────────────────────────────
-  describe("_sendStatusError", () => {
-    test("sends standard HTTP status text for known codes", () => {
-      const plugin = new FilesPlugin(VOLUMES_CONFIG);
-      const res = mockRes();
-
-      (plugin as any)._sendStatusError(res, 404);
-
-      expect(res.status).toHaveBeenCalledWith(404);
-      expect(res.json).toHaveBeenCalledWith({
-        error: "Not Found",
-        plugin: "files",
-      });
-    });
-
-    test("sends 'Unknown Error' for non-standard status codes", () => {
-      const plugin = new FilesPlugin(VOLUMES_CONFIG);
-      const res = mockRes();
-
-      (plugin as any)._sendStatusError(res, 999);
-
-      expect(res.status).toHaveBeenCalledWith(999);
-      expect(res.json).toHaveBeenCalledWith({
-        error: "Unknown Error",
-        plugin: "files",
-      });
-    });
-  });
-});

From e1e9017dea66adc9b3e046c147408c8c064f3f4c Mon Sep 17 00:00:00 2001
From: James Broadhead
Date: Tue, 28 Apr 2026 11:16:07 +0000
Subject: [PATCH 13/17] fix: handle ARROW_STREAM attachment in type generator
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Some serverless warehouses reject JSON_ARRAY + INLINE for DESCRIBE QUERY
and return ARROW_STREAM by default. The previous behavior just removed
the broken fallback, which meant typegen produced `unknown` types for
those warehouses' queries.

This restores the fallback (retry without explicit format if JSON_ARRAY
is rejected) and teaches `convertToQueryType` to decode an inline base64
Arrow IPC attachment when `data_array` is empty. The DESCRIBE QUERY
result is itself a table with rows shaped (col_name, data_type, comment),
so the decode reads `table.toArray().map(r => r.toJSON())` rather than
`table.schema.fields` — reading the schema would yield bogus types (every
query would come out shaped like the metadata columns).

Re-adds apache-arrow as an appkit dependency (only the typegen uses it;
the runtime SDK does not).

Tests cover: schema extraction from data rows, lowercase type
normalization, data_array taking precedence when both are present, and
graceful degradation on malformed attachments.

Supersedes #316.
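
For illustration, the decode is essentially the following condensed
sketch of the new `columnsFromArrowAttachment` helper (here `attachment`
stands for `result.result.attachment`):

    import { tableFromIPC } from "apache-arrow";

    // Each DESCRIBE QUERY row is (col_name, data_type, comment).
    const table = tableFromIPC(Buffer.from(attachment, "base64"));
    const columns = table.toArray().map((row) => {
      const r = row.toJSON() as Record<string, unknown>;
      return {
        name: typeof r.col_name === "string" ? r.col_name : "",
        type_name:
          typeof r.data_type === "string"
            ? r.data_type.toUpperCase()
            : "STRING",
      };
    });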
Co-authored-by: Isaac Signed-off-by: James Broadhead --- packages/appkit/package.json | 1 + .../src/type-generator/query-registry.ts | 94 +++++++++++++++--- .../tests/query-registry.test.ts | 96 +++++++++++++++++++ packages/appkit/src/type-generator/types.ts | 2 + pnpm-lock.yaml | 3 + 5 files changed, 184 insertions(+), 12 deletions(-) diff --git a/packages/appkit/package.json b/packages/appkit/package.json index 3b57014c0..04232f88b 100644 --- a/packages/appkit/package.json +++ b/packages/appkit/package.json @@ -69,6 +69,7 @@ "@opentelemetry/sdk-trace-base": "2.6.0", "@opentelemetry/semantic-conventions": "1.38.0", "@types/semver": "7.7.1", + "apache-arrow": "21.1.0", "dotenv": "16.6.1", "express": "4.22.0", "obug": "2.1.1", diff --git a/packages/appkit/src/type-generator/query-registry.ts b/packages/appkit/src/type-generator/query-registry.ts index a950a4a52..ea86a8be5 100644 --- a/packages/appkit/src/type-generator/query-registry.ts +++ b/packages/appkit/src/type-generator/query-registry.ts @@ -1,6 +1,7 @@ import fs from "node:fs/promises"; import path from "node:path"; import { WorkspaceClient } from "@databricks/sdk-experimental"; +import { tableFromIPC } from "apache-arrow"; import pc from "picocolors"; import { createLogger } from "../logging/logger"; import { CACHE_VERSION, hashSQL, loadCache, saveCache } from "./cache"; @@ -129,18 +130,69 @@ function formatParametersType(sql: string): string { : "Record"; } +/** + * Decode a base64 Arrow IPC attachment from a DESCRIBE QUERY response and + * extract column metadata. Returns the same shape as rows parsed from the + * legacy data_array path. + * + * IMPORTANT: a DESCRIBE QUERY response is itself a result *table* with rows + * shaped like `(col_name, data_type, comment)` describing the user query's + * output schema. We must read those rows — NOT `table.schema.fields`, which + * would describe DESCRIBE QUERY's own output (`col_name`, `data_type`, + * `comment`) and yield bogus types for every query. + */ +function columnsFromArrowAttachment( + attachment: string, +): Array<{ name: string; type_name: string; comment: string | undefined }> { + const buf = Buffer.from(attachment, "base64"); + const table = tableFromIPC(buf); + return table.toArray().map((row) => { + const obj = row.toJSON() as { + col_name?: unknown; + data_type?: unknown; + comment?: unknown; + }; + return { + name: typeof obj.col_name === "string" ? obj.col_name : "", + type_name: + typeof obj.data_type === "string" + ? obj.data_type.toUpperCase() + : "STRING", + comment: + typeof obj.comment === "string" && obj.comment !== "" + ? obj.comment + : undefined, + }; + }); +} + export function convertToQueryType( result: DatabricksStatementExecutionResponse, sql: string, queryName: string, ): { type: string; hasResults: boolean } { const dataRows = result.result?.data_array || []; - const columns = dataRows.map((row) => ({ + let columns = dataRows.map((row) => ({ name: row[0] || "", type_name: row[1]?.toUpperCase() || "STRING", comment: row[2] || undefined, })); + // Fallback: serverless warehouses return ARROW_STREAM format with an inline + // base64 attachment instead of data_array. Decode the Arrow IPC rows (the + // DESCRIBE QUERY result table) to extract column names and types. 
+ if (columns.length === 0 && result.result?.attachment) { + logger.debug("data_array empty, decoding Arrow IPC attachment for schema"); + try { + columns = columnsFromArrowAttachment(result.result.attachment); + } catch (err) { + logger.warn( + "Failed to decode Arrow IPC attachment: %s", + err instanceof Error ? err.message : String(err), + ); + } + } + const paramsType = formatParametersType(sql); // generate result fields with JSDoc @@ -386,16 +438,33 @@ export async function generateQueriesFromDescribe( sqlHash, cleanedSql, }: (typeof uncachedQueries)[number]): Promise => { - // Always request JSON_ARRAY + INLINE so the downstream caller can parse - // `data_array` predictably. If the warehouse rejects this combination, - // let the error propagate — the surrounding `Promise.allSettled` will - // generate `unknown` types via `generateUnknownResultQuery`. - const result = (await client.statementExecution.executeStatement({ - statement: `DESCRIBE QUERY ${cleanedSql}`, - warehouse_id: warehouseId, - format: "JSON_ARRAY", - disposition: "INLINE", - })) as DatabricksStatementExecutionResponse; + // Prefer JSON_ARRAY + INLINE so `data_array` parsing works directly. + // Some serverless warehouses reject this combination — fall back to the + // warehouse default (typically ARROW_STREAM + INLINE), and let + // `convertToQueryType` decode the inline attachment. + let result: DatabricksStatementExecutionResponse; + try { + result = (await client.statementExecution.executeStatement({ + statement: `DESCRIBE QUERY ${cleanedSql}`, + warehouse_id: warehouseId, + format: "JSON_ARRAY", + disposition: "INLINE", + })) as DatabricksStatementExecutionResponse; + } catch (err: unknown) { + const msg = err instanceof Error ? err.message : String(err); + if (msg.includes("ARROW_STREAM") || msg.includes("JSON_ARRAY")) { + logger.debug( + "Warehouse rejected JSON_ARRAY for %s, retrying with default format", + queryName, + ); + result = (await client.statementExecution.executeStatement({ + statement: `DESCRIBE QUERY ${cleanedSql}`, + warehouse_id: warehouseId, + })) as DatabricksStatementExecutionResponse; + } else { + throw err; + } + } completed++; spinner.update( @@ -403,10 +472,11 @@ export async function generateQueriesFromDescribe( ); logger.debug( - "DESCRIBE result for %s: state=%s, rows=%d", + "DESCRIBE result for %s: state=%s, rows=%d, hasAttachment=%s", queryName, result.status.state, result.result?.data_array?.length ?? 0, + !!result.result?.attachment, ); if (result.status.state === "FAILED") { diff --git a/packages/appkit/src/type-generator/tests/query-registry.test.ts b/packages/appkit/src/type-generator/tests/query-registry.test.ts index 8d46f98e9..d3c5be55e 100644 --- a/packages/appkit/src/type-generator/tests/query-registry.test.ts +++ b/packages/appkit/src/type-generator/tests/query-registry.test.ts @@ -1,3 +1,4 @@ +import { Table, tableToIPC, vectorFromArray } from "apache-arrow"; import { describe, expect, test } from "vitest"; import { convertToQueryType, @@ -11,6 +12,20 @@ import { } from "../query-registry"; import type { DatabricksStatementExecutionResponse } from "../types"; +// Build a base64 Arrow IPC payload that mimics a DESCRIBE QUERY response — +// a result *table* with columns (col_name, data_type, comment) describing +// the user query's output schema. 
+function describeQueryAttachment( + rows: Array<{ col_name: string; data_type: string; comment: string | null }>, +): string { + const table = new Table({ + col_name: vectorFromArray(rows.map((r) => r.col_name)), + data_type: vectorFromArray(rows.map((r) => r.data_type)), + comment: vectorFromArray(rows.map((r) => r.comment ?? "")), + }); + return Buffer.from(tableToIPC(table, "stream")).toString("base64"); +} + describe("normalizeTypeName", () => { test("returns simple types unchanged", () => { expect(normalizeTypeName("STRING")).toBe("STRING"); @@ -346,6 +361,87 @@ SELECT * FROM users WHERE date = :startDate AND count = :count AND name = :name` ); expect(hasResults).toBe(false); }); + + describe("ARROW_STREAM attachment fallback (serverless warehouses)", () => { + test("decodes column metadata from Arrow IPC data rows, not schema fields", () => { + // Critical regression test: it would be a bug to read + // `table.schema.fields` here, which would generate types like + // { col_name: string; data_type: string; comment: string } for every + // query (those are DESCRIBE QUERY's own output columns). We must read + // the data rows. + const attachment = describeQueryAttachment([ + { col_name: "user_id", data_type: "BIGINT", comment: null }, + { col_name: "name", data_type: "STRING", comment: "display name" }, + { col_name: "active", data_type: "BOOLEAN", comment: null }, + ]); + const response: DatabricksStatementExecutionResponse = { + statement_id: "test-arrow", + status: { state: "SUCCEEDED" }, + result: { attachment }, + }; + + const { type, hasResults } = convertToQueryType( + response, + "SELECT user_id, name, active FROM users", + "users", + ); + + expect(hasResults).toBe(true); + // Real query columns appear in the generated type: + expect(type).toContain("user_id: number"); + expect(type).toContain("name: string"); + expect(type).toContain("active: boolean"); + // Column comments survive: + expect(type).toContain("/** display name"); + // The DESCRIBE QUERY metadata column names must NOT leak as user types: + expect(type).not.toContain("col_name: string"); + expect(type).not.toContain("data_type: string"); + }); + + test("normalizes lowercase data_type values to uppercase", () => { + const attachment = describeQueryAttachment([ + { col_name: "id", data_type: "int", comment: null }, + ]); + const response: DatabricksStatementExecutionResponse = { + statement_id: "test-arrow", + status: { state: "SUCCEEDED" }, + result: { attachment }, + }; + + const { type } = convertToQueryType(response, "SELECT 1", "test"); + expect(type).toContain("@sqlType INT"); + expect(type).toContain("id: number"); + }); + + test("prefers data_array over attachment when both are present", () => { + const attachment = describeQueryAttachment([ + { col_name: "from_arrow", data_type: "STRING", comment: null }, + ]); + const response: DatabricksStatementExecutionResponse = { + statement_id: "test-both", + status: { state: "SUCCEEDED" }, + result: { + data_array: [["from_data_array", "INT", null]], + attachment, + }, + }; + + const { type } = convertToQueryType(response, "SELECT 1", "test"); + expect(type).toContain("from_data_array: number"); + expect(type).not.toContain("from_arrow"); + }); + + test("logs a warning and yields no columns on malformed attachment", () => { + const response: DatabricksStatementExecutionResponse = { + statement_id: "test-bad", + status: { state: "SUCCEEDED" }, + result: { attachment: "not-valid-arrow-ipc" }, + }; + + const { hasResults } = convertToQueryType(response, "SELECT 1", 
"test"); + expect(hasResults).toBe(false); + }); + }); }); describe("inferParameterTypes", () => { diff --git a/packages/appkit/src/type-generator/types.ts b/packages/appkit/src/type-generator/types.ts index 5af43591a..9a591f512 100644 --- a/packages/appkit/src/type-generator/types.ts +++ b/packages/appkit/src/type-generator/types.ts @@ -12,6 +12,8 @@ export interface DatabricksStatementExecutionResponse { }; result?: { data_array?: (string | null)[][]; + /** Base64-encoded Arrow IPC bytes (returned by serverless warehouses using ARROW_STREAM format) */ + attachment?: string; }; } diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 54512501d..46096f433 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -299,6 +299,9 @@ importers: '@types/semver': specifier: 7.7.1 version: 7.7.1 + apache-arrow: + specifier: 21.1.0 + version: 21.1.0 dotenv: specifier: 16.6.1 version: 16.6.1 From 003a309a4f2e9b9deb91759cf24bed85929f29c9 Mon Sep 17 00:00:00 2001 From: James Broadhead Date: Tue, 28 Apr 2026 11:38:14 +0000 Subject: [PATCH 14/17] fix: address ACE iter-2 review findings MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Tightens correctness gaps surfaced by the second ACE review pass (GPT 5.4 + Gemini 3.1 Pro + Claude harsh-reviewer): - _isInlineArrowUnsupported now requires both INLINE and ARROW_STREAM in the message regardless of whether a structured error_code is present. The previous shape would fall back on any INVALID_PARAMETER_VALUE that mentioned just one keyword. - useAnalyticsQuery validates that arrow_inline messages carry a non-empty string attachment before invoking atob, so a malformed payload surfaces a clear error instead of a confusing decode crash. - ARROW_STREAM cache TTL is capped at 600s rather than disabling the cache outright. Pre-signed EXTERNAL_LINKS URLs typically expire in ~15min, so 10min is a safe upper bound that still preserves caching for INLINE attachment responses. - Type generator's JSON_ARRAY-rejection retry now (a) requires both JSON_ARRAY and a marker phrase before retrying, mirroring the analytics-plugin tightening, and (b) explicitly requests ARROW_STREAM + INLINE on the retry rather than letting the warehouse default kick in. Previously the warehouse default could return EXTERNAL_LINKS, in which case neither data_array nor attachment were populated and types silently degraded to `unknown`. - _validateArrowAttachment strips whitespace and accounts for `=` padding so the size check is exact rather than an upper bound. Also uses `Math.floor` so the cap matches the actual decoded byte count. Tests: - New: structured error_code path triggers fallback when both keywords are present. - New: regression — error mentioning only one of INLINE / ARROW_STREAM must not escalate to EXTERNAL_LINKS even if the retry interceptor attempts the query multiple times. - New: hook rejects arrow_inline with attachment values that are undefined / null / empty / non-string / object, never invoking atob or processArrowBuffer on bad input. 
Co-authored-by: Isaac Signed-off-by: James Broadhead --- .../__tests__/use-analytics-query.test.ts | 25 ++++++ .../src/react/hooks/use-analytics-query.ts | 11 +++ .../src/connectors/sql-warehouse/client.ts | 14 +++- .../appkit/src/plugins/analytics/analytics.ts | 47 ++++++----- .../plugins/analytics/tests/analytics.test.ts | 83 +++++++++++++++++++ .../src/type-generator/query-registry.ts | 19 +++-- 6 files changed, 173 insertions(+), 26 deletions(-) diff --git a/packages/appkit-ui/src/react/hooks/__tests__/use-analytics-query.test.ts b/packages/appkit-ui/src/react/hooks/__tests__/use-analytics-query.test.ts index d4bf51010..81159792a 100644 --- a/packages/appkit-ui/src/react/hooks/__tests__/use-analytics-query.test.ts +++ b/packages/appkit-ui/src/react/hooks/__tests__/use-analytics-query.test.ts @@ -77,6 +77,31 @@ describe("useAnalyticsQuery", () => { expect(result.current.loading).toBe(false); }); + test("rejects arrow_inline with missing/empty/non-string attachment without crashing atob", async () => { + const cases: Array = [undefined, null, "", 123, { foo: "bar" }]; + + for (const attachment of cases) { + mockProcessArrowBuffer.mockClear(); + const { result, unmount } = renderHook(() => + useAnalyticsQuery("q", null, { format: "ARROW_STREAM" }), + ); + + await lastConnectArgs.onMessage({ + data: JSON.stringify({ type: "arrow_inline", attachment }), + }); + + await waitFor(() => { + expect(result.current.error).toBe( + "Unable to load data, please try again", + ); + }); + // Critically: must NOT call processArrowBuffer (or atob) on the bad input. + expect(mockProcessArrowBuffer).not.toHaveBeenCalled(); + + unmount(); + } + }); + test("still handles type:result rows for JSON_ARRAY", async () => { const { result } = renderHook(() => useAnalyticsQuery("q", null, { format: "JSON_ARRAY" }), diff --git a/packages/appkit-ui/src/react/hooks/use-analytics-query.ts b/packages/appkit-ui/src/react/hooks/use-analytics-query.ts index 3a3aa8789..1817699d8 100644 --- a/packages/appkit-ui/src/react/hooks/use-analytics-query.ts +++ b/packages/appkit-ui/src/react/hooks/use-analytics-query.ts @@ -163,6 +163,17 @@ export function useAnalyticsQuery< // success - Arrow format (inline: decode base64 IPC payload locally) if (parsed.type === "arrow_inline") { + if ( + typeof parsed.attachment !== "string" || + parsed.attachment.length === 0 + ) { + console.error( + "[useAnalyticsQuery] arrow_inline message missing attachment", + ); + setLoading(false); + setError("Unable to load data, please try again"); + return; + } try { const buffer = decodeBase64(parsed.attachment); const table = await ArrowClient.processArrowBuffer(buffer); diff --git a/packages/appkit/src/connectors/sql-warehouse/client.ts b/packages/appkit/src/connectors/sql-warehouse/client.ts index 050e5cd8a..df7f0178e 100644 --- a/packages/appkit/src/connectors/sql-warehouse/client.ts +++ b/packages/appkit/src/connectors/sql-warehouse/client.ts @@ -476,8 +476,18 @@ export class SQLWarehouseConnector { ) { // Cap the size to protect against unbounded inline payloads from // misbehaving warehouses. 64 MiB is well above the typical inline limit - // (~16 MiB) but bounds memory if a server returns a runaway response. - const decodedSize = Math.ceil((attachment.length * 3) / 4); + // (~25 MiB hard cap on the API) but bounds memory if a server returns + // a runaway response. + // + // Strip whitespace (rare but legal in base64) and account for trailing + // `=` padding so the byte count is exact rather than an upper bound. 
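+      //
+      // Worked example (illustrative): "QUJD" ("ABC") has no padding, so
+      // floor(4 * 3 / 4) - 0 = 3 bytes; "QQ==" ("A") has two padding
+      // chars, so floor(4 * 3 / 4) - 2 = 1 byte.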
+ const stripped = attachment.replace(/\s+/g, ""); + const padding = stripped.endsWith("==") + ? 2 + : stripped.endsWith("=") + ? 1 + : 0; + const decodedSize = Math.floor((stripped.length * 3) / 4) - padding; if (decodedSize > MAX_INLINE_ATTACHMENT_BYTES) { throw ExecutionError.statementFailed( `Inline Arrow attachment exceeds maximum size (${decodedSize} > ${MAX_INLINE_ATTACHMENT_BYTES} bytes)`, diff --git a/packages/appkit/src/plugins/analytics/analytics.ts b/packages/appkit/src/plugins/analytics/analytics.ts index b26dd4c01..56384bf86 100644 --- a/packages/appkit/src/plugins/analytics/analytics.ts +++ b/packages/appkit/src/plugins/analytics/analytics.ts @@ -162,22 +162,26 @@ export class AnalyticsPlugin extends Plugin { const hashedQuery = this.queryProcessor.hashQuery(query); // ARROW_STREAM may resolve to EXTERNAL_LINKS, which returns pre-signed URLs - // that typically expire well before queryDefaults.cache.ttl. Disable cache - // for ARROW_STREAM to avoid handing out dead URLs from cache. - const cacheConfig = + // that typically expire ~15 minutes after issue. Cap the cache TTL well + // under that for ARROW_STREAM so we never hand out dead URLs from cache, + // while still benefiting from caching INLINE attachment responses (and + // EXTERNAL_LINKS responses inside their valid window). + const cacheTtl = format === "ARROW_STREAM" - ? { ...queryDefaults.cache, enabled: false } - : { - ...queryDefaults.cache, - cacheKey: [ - "analytics:query", - query_key, - JSON.stringify(parameters), - format, - hashedQuery, - executorKey, - ], - }; + ? Math.min(queryDefaults.cache?.ttl ?? 600, 600) + : queryDefaults.cache?.ttl; + const cacheConfig = { + ...queryDefaults.cache, + ttl: cacheTtl, + cacheKey: [ + "analytics:query", + query_key, + JSON.stringify(parameters), + format, + hashedQuery, + executorKey, + ], + }; const defaultConfig: PluginExecuteConfig = { ...queryDefaults, @@ -358,18 +362,23 @@ export class AnalyticsPlugin extends Plugin { function _isInlineArrowUnsupported(err: unknown): boolean { const msg = err instanceof Error ? err.message : String(err); + // Both branches require both INLINE and ARROW_STREAM to appear in the + // message — without that pairing we cannot distinguish a format-rejection + // from an unrelated SQL/permission error that happens to mention one + // keyword (e.g. a column named "INLINE_USERS"). 
+ if (!msg.includes("INLINE") || !msg.includes("ARROW_STREAM")) { + return false; + } + const errorCodeMatch = msg.match(/"error_code"\s*:\s*"([^"]+)"/); const errorCode = errorCodeMatch?.[1]; if ( errorCode === "INVALID_PARAMETER_VALUE" || errorCode === "NOT_IMPLEMENTED" ) { - return msg.includes("INLINE") || msg.includes("ARROW_STREAM"); + return true; } - if (!msg.includes("INLINE") || !msg.includes("ARROW_STREAM")) { - return false; - } return ( msg.includes("not supported") || msg.includes("INVALID_PARAMETER_VALUE") || diff --git a/packages/appkit/src/plugins/analytics/tests/analytics.test.ts b/packages/appkit/src/plugins/analytics/tests/analytics.test.ts index 643ee2ca9..a8c7821c1 100644 --- a/packages/appkit/src/plugins/analytics/tests/analytics.test.ts +++ b/packages/appkit/src/plugins/analytics/tests/analytics.test.ts @@ -731,6 +731,89 @@ describe("Analytics Plugin", () => { }); }); + test("/query/:query_key falls back when error message carries a structured INVALID_PARAMETER_VALUE error_code", async () => { + const plugin = new AnalyticsPlugin(config); + const { router, getHandler } = createMockRouter(); + + (plugin as any).app.getAppQuery = vi.fn().mockResolvedValue({ + query: "SELECT * FROM test", + isAsUser: false, + }); + + // Wrapped JSON error like the SDK surfaces from a `Bad Request` HTTP + // response. Both INLINE and ARROW_STREAM appear, plus the structured code. + const wrappedJsonError = new Error( + 'Response from server (Bad Request) {"error_code":"INVALID_PARAMETER_VALUE","message":"ARROW_STREAM is not supported with INLINE disposition on this warehouse"}', + ); + const executeMock = vi + .fn() + .mockRejectedValueOnce(wrappedJsonError) + .mockResolvedValueOnce({ + result: { statement_id: "stmt-1", status: { state: "SUCCEEDED" } }, + }); + (plugin as any).SQLClient.executeStatement = executeMock; + + plugin.injectRoutes(router); + + const handler = getHandler("POST", "/query/:query_key"); + const mockReq = createMockRequest({ + params: { query_key: "test_query" }, + body: { parameters: {}, format: "ARROW_STREAM" }, + }); + const mockRes = createMockResponse(); + + await handler(mockReq, mockRes); + + // Both attempts ran: INLINE (rejected) then EXTERNAL_LINKS (succeeded). + expect(executeMock).toHaveBeenCalledTimes(2); + expect(executeMock.mock.calls[1][1]).toMatchObject({ + disposition: "EXTERNAL_LINKS", + format: "ARROW_STREAM", + }); + }); + + test("/query/:query_key does NOT fall back when only one of INLINE/ARROW_STREAM appears in the error", async () => { + const plugin = new AnalyticsPlugin(config); + const { router, getHandler } = createMockRouter(); + + (plugin as any).app.getAppQuery = vi.fn().mockResolvedValue({ + query: "SELECT * FROM test", + isAsUser: false, + }); + + // Realistic non-format error that mentions just one of the keywords — + // e.g. an unrelated INVALID_PARAMETER_VALUE about a different param. 
+ const executeMock = vi + .fn() + .mockRejectedValue( + new Error( + 'Response from server (Bad Request) {"error_code":"INVALID_PARAMETER_VALUE","message":"INLINE is not a valid value for parameter `mode`"}', + ), + ); + (plugin as any).SQLClient.executeStatement = executeMock; + + plugin.injectRoutes(router); + + const handler = getHandler("POST", "/query/:query_key"); + const mockReq = createMockRequest({ + params: { query_key: "test_query" }, + body: { parameters: {}, format: "ARROW_STREAM" }, + }); + const mockRes = createMockResponse(); + + await handler(mockReq, mockRes); + + // The retry interceptor may attempt the query multiple times, but the + // analytics plugin must never escalate to EXTERNAL_LINKS for an error + // that doesn't actually indicate a format/disposition rejection. + for (const call of executeMock.mock.calls) { + expect(call[1]).toMatchObject({ + disposition: "INLINE", + format: "ARROW_STREAM", + }); + } + }); + test("/query/:query_key should not fall back for non-format errors", async () => { const plugin = new AnalyticsPlugin(config); const { router, getHandler } = createMockRouter(); diff --git a/packages/appkit/src/type-generator/query-registry.ts b/packages/appkit/src/type-generator/query-registry.ts index ea86a8be5..63c531d15 100644 --- a/packages/appkit/src/type-generator/query-registry.ts +++ b/packages/appkit/src/type-generator/query-registry.ts @@ -439,9 +439,11 @@ export async function generateQueriesFromDescribe( cleanedSql, }: (typeof uncachedQueries)[number]): Promise => { // Prefer JSON_ARRAY + INLINE so `data_array` parsing works directly. - // Some serverless warehouses reject this combination — fall back to the - // warehouse default (typically ARROW_STREAM + INLINE), and let - // `convertToQueryType` decode the inline attachment. + // Some serverless warehouses reject this combination — fall back to + // ARROW_STREAM + INLINE (still inline, just a different format) and + // let `convertToQueryType` decode the inline attachment. Forcing + // INLINE on the retry avoids EXTERNAL_LINKS, which would silently + // produce empty `data_array` and degrade types to `unknown`. let result: DatabricksStatementExecutionResponse; try { result = (await client.statementExecution.executeStatement({ @@ -452,14 +454,21 @@ export async function generateQueriesFromDescribe( })) as DatabricksStatementExecutionResponse; } catch (err: unknown) { const msg = err instanceof Error ? 
err.message : String(err);
-      if (msg.includes("ARROW_STREAM") || msg.includes("JSON_ARRAY")) {
+      const looksLikeFormatRejection =
+        msg.includes("JSON_ARRAY") &&
+        (msg.includes("not supported") ||
+          msg.includes("INVALID_PARAMETER_VALUE") ||
+          msg.includes("NOT_IMPLEMENTED"));
+      if (looksLikeFormatRejection) {
         logger.debug(
-          "Warehouse rejected JSON_ARRAY for %s, retrying with default format",
+          "Warehouse rejected JSON_ARRAY+INLINE for %s, retrying with ARROW_STREAM+INLINE",
           queryName,
         );
         result = (await client.statementExecution.executeStatement({
           statement: `DESCRIBE QUERY ${cleanedSql}`,
           warehouse_id: warehouseId,
+          format: "ARROW_STREAM",
+          disposition: "INLINE",
         })) as DatabricksStatementExecutionResponse;
       } else {
         throw err;

From cf50679c513a1251f9f0356ebbc7038d918167f8 Mon Sep 17 00:00:00 2001
From: James Broadhead
Date: Tue, 28 Apr 2026 16:25:48 +0000
Subject: [PATCH 15/17] fix: synthesize empty Arrow IPC for empty ARROW_STREAM
 responses
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The hook is typed `TypedArrowTable` for ARROW_STREAM, but empty results
(zero rows / no inline attachment / no external_links) previously fell
through to `{ type: "result", ...result }` with no `data` field. Direct
useAnalyticsQuery callers expecting a Table would crash on `.numRows` or
`.getChild()`.

This commit makes ARROW_STREAM truly contract-consistent: when the
response has a manifest schema but no row payload, the connector
synthesizes a zero-row Arrow IPC stream from the schema and stashes it
in `result.attachment`. The existing arrow_inline path emits it to the
client, which decodes it as a real Arrow Table — same shape as a
non-empty result, just with `numRows === 0` and empty vectors per
column.

A new module `connectors/sql-warehouse/arrow-schema.ts` parses
Databricks SQL `type_text` values into Apache Arrow `DataType`s. It
covers the full documented type surface:
- Scalars (STRING, BIGINT, DECIMAL, TIMESTAMP, BOOLEAN, …)
- Parameterized: DECIMAL(p,s), VARCHAR(n), CHAR(n)
- Nested: ARRAY, MAP, STRUCT
- INTERVAL year-month and day-time variants
- Backtick-quoted struct field names (incl. doubled-backtick escapes)
- Recursive nesting (verified on deeply nested MAP / ARRAY / STRUCT
  combinations)
- NOT NULL annotations on struct fields

Unknown / unparseable types degrade to Utf8 rather than throwing, so
future Databricks types don't crash empty-result handling.

Tests: 72 cases covering every scalar, parameterization edge case,
deeply nested combinations, struct field annotations / comments /
backtick escapes, INTERVAL variants, and IPC round-trip. 7 connector
tests covering attachment synthesis + the cases where it must NOT
synthesize (external_links present, schema absent).
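
Illustratively, a direct caller now sees a real zero-row Table for an
empty result (the `user_id` column below is hypothetical):

    // buffer decoded from the synthesized inline attachment
    const table = await ArrowClient.processArrowBuffer(buffer);
    table.numRows;                          // 0
    table.schema.fields.map((f) => f.name); // column names from the manifest schema
    table.getChild("user_id")?.length;      // undefined or 0, never a crash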
Co-authored-by: Isaac Signed-off-by: James Broadhead --- .../connectors/sql-warehouse/arrow-schema.ts | 441 +++++++++++++++ .../src/connectors/sql-warehouse/client.ts | 21 +- .../sql-warehouse/tests/arrow-schema.test.ts | 514 ++++++++++++++++++ .../sql-warehouse/tests/client.test.ts | 78 +++ 4 files changed, 1051 insertions(+), 3 deletions(-) create mode 100644 packages/appkit/src/connectors/sql-warehouse/arrow-schema.ts create mode 100644 packages/appkit/src/connectors/sql-warehouse/tests/arrow-schema.test.ts diff --git a/packages/appkit/src/connectors/sql-warehouse/arrow-schema.ts b/packages/appkit/src/connectors/sql-warehouse/arrow-schema.ts new file mode 100644 index 000000000..17d099e37 --- /dev/null +++ b/packages/appkit/src/connectors/sql-warehouse/arrow-schema.ts @@ -0,0 +1,441 @@ +import { + Binary, + Bool, + type DataType, + DateDay, + Decimal, + DurationMicrosecond, + Field, + Float32, + Float64, + Int8, + Int16, + Int32, + Int64, + IntervalYearMonth, + List, + Map_, + Null, + Schema, + Struct, + Table, + TimestampMicrosecond, + tableToIPC, + Utf8, +} from "apache-arrow"; + +/** + * Parse a Databricks SQL type text (the value returned by the Statement + * Execution API in `ColumnInfo.type_text`) into an Apache Arrow DataType. + * + * Supports: + * - All scalar types (STRING, INT, BIGINT, DECIMAL, TIMESTAMP, etc.) + * - Parameterized scalars: DECIMAL(p,s), VARCHAR(n), CHAR(n) + * - Nested types: ARRAY, MAP, STRUCT + * - INTERVAL year-month and day-time variants + * - Backtick-quoted struct field names with embedded `` `` `` escapes + * + * Unknown or unparseable types fall back to Utf8 — empty-Table consumers + * still see a column with the right name; only the inner type is degraded. + */ +export function parseDatabricksType(typeText: string): DataType { + const parser = new TypeParser(typeText); + const result = parser.parseType(); + parser.expectEnd(); + return result; +} + +/** + * Build an empty Arrow IPC stream (base64-encoded) matching the column schema + * returned by the warehouse. Used so ARROW_STREAM responses with no rows still + * deliver a real Arrow Table to the client, preserving the hook's typed + * contract. + */ +export function buildEmptyArrowIPCBase64( + columns: Array<{ + name?: string; + type_text?: string; + type_name?: string; + }>, +): string { + const fields = columns.map((col, index) => { + const typeText = col.type_text ?? col.type_name ?? "STRING"; + let dataType: DataType; + try { + dataType = parseDatabricksType(typeText); + } catch { + dataType = new Utf8(); + } + const name = col.name && col.name.length > 0 ? 
col.name : `column_${index}`; + return new Field(name, dataType, true); + }); + const schema = new Schema(fields); + const table = new Table(schema); + const ipc = tableToIPC(table, "stream"); + return Buffer.from(ipc).toString("base64"); +} + +// ============================================================================ +// Recursive-descent parser +// ============================================================================ + +class TypeParser { + private readonly input: string; + private pos = 0; + + constructor(input: string) { + this.input = input; + } + + parseType(): DataType { + this.skipWs(); + + let name: string; + if (this.peek() === "`") { + name = this.consumeBacktickIdent(); + } else { + name = this.consumeIdent(); + } + const upper = name.toUpperCase(); + + this.skipWs(); + + if (upper === "INTERVAL") { + return this.parseInterval(); + } + + if (this.peek() === "(") { + this.consume("("); + const args = this.parseNumberArgs(); + this.consume(")"); + this.skipWs(); + return this.makeParameterized(upper, args); + } + + if (this.peek() === "<") { + this.consume("<"); + const result = this.makeGeneric(upper); + this.skipWs(); + this.consume(">"); + return result; + } + + return this.makeScalar(upper); + } + + expectEnd(): void { + this.skipWs(); + if (this.pos < this.input.length) { + throw new Error( + `Unexpected trailing input at position ${this.pos}: "${this.input.slice(this.pos)}"`, + ); + } + } + + // ─── Type constructors ─────────────────────────────────── + + private makeScalar(upper: string): DataType { + switch (upper) { + case "STRING": + case "VARIANT": + return new Utf8(); + case "VARCHAR": + case "CHAR": + return new Utf8(); + case "BINARY": + case "GEOGRAPHY": + case "GEOMETRY": + return new Binary(); + case "BOOLEAN": + case "BOOL": + return new Bool(); + case "TINYINT": + case "BYTE": + return new Int8(); + case "SMALLINT": + case "SHORT": + return new Int16(); + case "INT": + case "INTEGER": + return new Int32(); + case "BIGINT": + case "LONG": + return new Int64(); + case "FLOAT": + case "REAL": + return new Float32(); + case "DOUBLE": + return new Float64(); + case "DECIMAL": + case "NUMERIC": + case "DEC": + return new Decimal(0, 10, 128); + case "DATE": + return new DateDay(); + case "TIMESTAMP": + case "TIMESTAMP_LTZ": + return new TimestampMicrosecond("UTC"); + case "TIMESTAMP_NTZ": + return new TimestampMicrosecond(); + case "VOID": + case "NULL": + return new Null(); + default: + return new Utf8(); + } + } + + private makeParameterized(upper: string, args: number[]): DataType { + switch (upper) { + case "DECIMAL": + case "NUMERIC": + case "DEC": { + const precision = args[0] ?? 10; + const scale = args[1] ?? 0; + // Arrow JS Decimal constructor signature is (scale, precision, bitWidth). 
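+        // e.g. (illustrative) DECIMAL(10,2): precision=10, scale=2 →
+        // new Decimal(2, 10, 128).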
+ return new Decimal(scale, precision, 128); + } + case "VARCHAR": + case "CHAR": + return new Utf8(); + default: + return new Utf8(); + } + } + + private makeGeneric(upper: string): DataType { + switch (upper) { + case "ARRAY": { + const inner = this.parseType(); + return new List(new Field("item", inner, true)); + } + case "MAP": { + const keyType = this.parseType(); + this.skipWs(); + this.consume(","); + this.skipWs(); + const valueType = this.parseType(); + const entriesStruct = new Struct([ + new Field("key", keyType, false), + new Field("value", valueType, true), + ]); + return new Map_(new Field("entries", entriesStruct, false), false); + } + case "STRUCT": + return this.parseStructFields(); + default: + // Unknown generic — skip to matching '>' and fall back. + this.skipBalancedAngles(); + return new Utf8(); + } + } + + private parseStructFields(): DataType { + const fields: Field[] = []; + while (true) { + this.skipWs(); + if (this.peek() === ">") break; + + let name: string; + if (this.peek() === "`") { + name = this.consumeBacktickIdent(); + } else { + name = this.consumeIdent(); + } + + this.skipWs(); + this.consume(":"); + this.skipWs(); + + const type = this.parseType(); + + // Optional `NOT NULL` and `COMMENT '...'`. Both are accepted by + // Databricks DDL and may appear in `type_text`. + this.skipWs(); + while (this.peekKeyword("NOT")) { + this.consumeIdent(); + this.skipWs(); + if (this.peekKeyword("NULL")) { + this.consumeIdent(); + } + this.skipWs(); + } + if (this.peekKeyword("COMMENT")) { + this.consumeIdent(); + this.skipWs(); + this.consumeStringLiteral(); + this.skipWs(); + } + + fields.push(new Field(name, type, true)); + + this.skipWs(); + if (this.peek() === ",") { + this.consume(","); + } else { + break; + } + } + return new Struct(fields); + } + + private parseInterval(): DataType { + // Grammar: INTERVAL [TO ] + // YEAR / MONTH variants -> IntervalYearMonth + // DAY / HOUR / MINUTE / SECOND variants -> Duration(microsecond) + const seen: string[] = []; + while (this.pos < this.input.length) { + this.skipWs(); + const c = this.peek(); + if (c === "" || c === "," || c === ">" || c === ")") break; + const word = this.consumeIdent().toUpperCase(); + seen.push(word); + } + const isYearMonth = seen.some((w) => w === "YEAR" || w === "MONTH"); + return isYearMonth ? new IntervalYearMonth() : new DurationMicrosecond(); + } + + private parseNumberArgs(): number[] { + const args: number[] = []; + while (true) { + this.skipWs(); + if (this.peek() === ")") break; + args.push(this.consumeNumber()); + this.skipWs(); + if (this.peek() === ",") { + this.consume(","); + } else { + break; + } + } + return args; + } + + // ─── Token utilities ───────────────────────────────────── + + private peek(): string { + return this.input[this.pos] ?? ""; + } + + private peekKeyword(word: string): boolean { + const slice = this.input.slice(this.pos, this.pos + word.length); + if (slice.toUpperCase() !== word.toUpperCase()) return false; + // Must be followed by a non-identifier character (boundary check). + const next = this.input[this.pos + word.length] ?? ""; + return !/[A-Za-z0-9_]/.test(next); + } + + private consume(expected: string): void { + if (this.peek() !== expected) { + throw new Error( + `Expected "${expected}" at position ${this.pos}, got "${this.peek()}" in "${this.input}"`, + ); + } + this.pos++; + } + + private skipWs(): void { + while ( + this.pos < this.input.length && + /\s/.test(this.input[this.pos] ?? 
"") + ) { + this.pos++; + } + } + + private consumeIdent(): string { + const start = this.pos; + while ( + this.pos < this.input.length && + /[A-Za-z0-9_]/.test(this.input[this.pos] ?? "") + ) { + this.pos++; + } + if (this.pos === start) { + throw new Error( + `Expected identifier at position ${this.pos}, got "${this.peek()}" in "${this.input}"`, + ); + } + return this.input.slice(start, this.pos); + } + + private consumeBacktickIdent(): string { + this.consume("`"); + let value = ""; + while (this.pos < this.input.length) { + if (this.input[this.pos] === "`") { + if (this.input[this.pos + 1] === "`") { + value += "`"; + this.pos += 2; + continue; + } + break; + } + value += this.input[this.pos]; + this.pos++; + } + this.consume("`"); + return value; + } + + private consumeNumber(): number { + const start = this.pos; + while ( + this.pos < this.input.length && + /[0-9]/.test(this.input[this.pos] ?? "") + ) { + this.pos++; + } + if (this.pos === start) { + throw new Error( + `Expected number at position ${this.pos}, got "${this.peek()}" in "${this.input}"`, + ); + } + return Number.parseInt(this.input.slice(start, this.pos), 10); + } + + private consumeStringLiteral(): string { + const quote = this.peek(); + if (quote !== "'" && quote !== '"') { + throw new Error( + `Expected string literal at position ${this.pos}, got "${quote}" in "${this.input}"`, + ); + } + this.pos++; + let value = ""; + while (this.pos < this.input.length) { + const c = this.input[this.pos]; + if (c === "\\") { + // Escape sequence: keep the next char verbatim. + const next = this.input[this.pos + 1]; + if (next !== undefined) { + value += next; + this.pos += 2; + continue; + } + this.pos++; + continue; + } + if (c === quote) { + this.pos++; + return value; + } + value += c; + this.pos++; + } + throw new Error(`Unterminated string literal in "${this.input}"`); + } + + private skipBalancedAngles(): void { + let depth = 1; + while (this.pos < this.input.length && depth > 0) { + const c = this.peek(); + if (c === "<") depth++; + else if (c === ">") { + depth--; + if (depth === 0) return; + } + this.pos++; + } + } +} diff --git a/packages/appkit/src/connectors/sql-warehouse/client.ts b/packages/appkit/src/connectors/sql-warehouse/client.ts index df7f0178e..76f1b219c 100644 --- a/packages/appkit/src/connectors/sql-warehouse/client.ts +++ b/packages/appkit/src/connectors/sql-warehouse/client.ts @@ -21,6 +21,7 @@ import { SpanStatusCode, TelemetryManager, } from "../../telemetry"; +import { buildEmptyArrowIPCBase64 } from "./arrow-schema"; import { executeStatementDefaults } from "./defaults"; const logger = createLogger("connectors:sql-warehouse"); @@ -413,9 +414,23 @@ export class SQLWarehouseConnector { return this.updateWithArrowStatus(response); } - // Inline data_array: fall through to the row transform below. - // (Anything else — empty result with no attachment, data_array, or - // external_links — also falls through and produces { data: [] }.) + // Empty result with a known schema: synthesize a zero-row Arrow IPC + // attachment so the client always receives an Arrow Table for + // ARROW_STREAM, regardless of whether the warehouse returned data. + if (!result?.data_array && response.manifest?.schema?.columns) { + const synthesized = buildEmptyArrowIPCBase64( + response.manifest.schema.columns, + ); + return { + ...response, + result: { ...(result ?? {}), attachment: synthesized }, + }; + } + + // Inline data_array under ARROW_STREAM (rare): fall through to the + // row transform below. 
The hook will receive `type: "result"` rows; + // callers asking for ARROW_STREAM should not hit this path with + // current Databricks warehouses. } if (!response.result?.data_array || !response.manifest?.schema?.columns) { diff --git a/packages/appkit/src/connectors/sql-warehouse/tests/arrow-schema.test.ts b/packages/appkit/src/connectors/sql-warehouse/tests/arrow-schema.test.ts new file mode 100644 index 000000000..e30b7315a --- /dev/null +++ b/packages/appkit/src/connectors/sql-warehouse/tests/arrow-schema.test.ts @@ -0,0 +1,514 @@ +import { + Binary, + Bool, + type DataType, + DateDay, + Decimal, + DurationMicrosecond, + Float32, + Float64, + Int8, + Int16, + Int32, + Int64, + IntervalYearMonth, + List, + Map_, + Null, + Struct, + TimestampMicrosecond, + Type, + tableFromIPC, + Utf8, +} from "apache-arrow"; +import { describe, expect, test } from "vitest"; +import { buildEmptyArrowIPCBase64, parseDatabricksType } from "../arrow-schema"; + +// ============================================================================ +// Helpers +// ============================================================================ + +/** Walk the type tree and produce a stable string representation for assertions. */ +function typeSummary(t: DataType): string { + if (t instanceof Decimal) return `Decimal(${t.precision},${t.scale})`; + if (t instanceof TimestampMicrosecond) { + const tz = (t as TimestampMicrosecond & { timezone?: string }).timezone; + return tz ? `Timestamp[us,${tz}]` : "Timestamp[us]"; + } + if (t instanceof List) { + const inner = (t.children?.[0]?.type as DataType | undefined) ?? new Utf8(); + return `List<${typeSummary(inner)}>`; + } + if (t instanceof Struct) { + const inner = (t.children ?? []) + .map((f) => `${f.name}:${typeSummary(f.type as DataType)}`) + .join(","); + return `Struct<${inner}>`; + } + if (t instanceof Map_) { + const entries = + (t.children?.[0]?.type as Struct | undefined)?.children ?? []; + const k = entries[0]?.type as DataType | undefined; + const v = entries[1]?.type as DataType | undefined; + return `Map<${typeSummary(k ?? new Utf8())},${typeSummary(v ?? new Utf8())}>`; + } + // Fall back to typeId for primitives. + return Type[t.typeId] ?? 
t.constructor.name; +} + +// ============================================================================ +// Scalar types +// ============================================================================ + +describe("parseDatabricksType — scalars", () => { + test.each([ + ["STRING", Utf8], + ["VARIANT", Utf8], + ["BINARY", Binary], + ["GEOGRAPHY", Binary], + ["GEOMETRY", Binary], + ["BOOLEAN", Bool], + ["BOOL", Bool], + ["TINYINT", Int8], + ["BYTE", Int8], + ["SMALLINT", Int16], + ["SHORT", Int16], + ["INT", Int32], + ["INTEGER", Int32], + ["BIGINT", Int64], + ["LONG", Int64], + ["FLOAT", Float32], + ["REAL", Float32], + ["DOUBLE", Float64], + ["DATE", DateDay], + ["VOID", Null], + ["NULL", Null], + ] as const)("%s parses to expected type", (input, ctor) => { + const t = parseDatabricksType(input); + expect(t).toBeInstanceOf(ctor); + }); + + test("case-insensitive — lowercase is accepted", () => { + expect(parseDatabricksType("string")).toBeInstanceOf(Utf8); + expect(parseDatabricksType("bigint")).toBeInstanceOf(Int64); + }); + + test("TIMESTAMP defaults to UTC tz", () => { + const t = parseDatabricksType("TIMESTAMP") as TimestampMicrosecond; + expect(t).toBeInstanceOf(TimestampMicrosecond); + expect(t.timezone).toBe("UTC"); + }); + + test("TIMESTAMP_LTZ behaves like TIMESTAMP", () => { + const t = parseDatabricksType("TIMESTAMP_LTZ") as TimestampMicrosecond; + expect(t.timezone).toBe("UTC"); + }); + + test("TIMESTAMP_NTZ has no timezone", () => { + const t = parseDatabricksType("TIMESTAMP_NTZ") as TimestampMicrosecond; + expect(t).toBeInstanceOf(TimestampMicrosecond); + expect(t.timezone == null || t.timezone === "").toBe(true); + }); + + test("Unknown scalar falls back to Utf8 (degraded but doesn't throw)", () => { + expect(parseDatabricksType("SOMETHING_NEW")).toBeInstanceOf(Utf8); + }); +}); + +// ============================================================================ +// Parameterized scalars +// ============================================================================ + +describe("parseDatabricksType — parameterized scalars", () => { + test("VARCHAR(255) → Utf8 (Arrow doesn't track string length)", () => { + expect(parseDatabricksType("VARCHAR(255)")).toBeInstanceOf(Utf8); + }); + + test("CHAR(10) → Utf8", () => { + expect(parseDatabricksType("CHAR(10)")).toBeInstanceOf(Utf8); + }); + + test("DECIMAL(10,2) → Decimal(precision=10, scale=2)", () => { + const t = parseDatabricksType("DECIMAL(10,2)") as Decimal; + expect(t).toBeInstanceOf(Decimal); + expect(t.precision).toBe(10); + expect(t.scale).toBe(2); + }); + + test("DECIMAL(38,0) — max precision, no scale", () => { + const t = parseDatabricksType("DECIMAL(38,0)") as Decimal; + expect(t.precision).toBe(38); + expect(t.scale).toBe(0); + }); + + test("NUMERIC(p,s) is an alias for DECIMAL(p,s)", () => { + const t = parseDatabricksType("NUMERIC(15,4)") as Decimal; + expect(t).toBeInstanceOf(Decimal); + expect(t.precision).toBe(15); + expect(t.scale).toBe(4); + }); + + test("DEC(p,s) is an alias for DECIMAL(p,s)", () => { + const t = parseDatabricksType("DEC(7,3)") as Decimal; + expect(t.precision).toBe(7); + expect(t.scale).toBe(3); + }); + + test("DECIMAL with whitespace inside parens", () => { + const t = parseDatabricksType("DECIMAL( 10 , 2 )") as Decimal; + expect(t.precision).toBe(10); + expect(t.scale).toBe(2); + }); + + test("DECIMAL with single arg (precision only) defaults scale=0", () => { + const t = parseDatabricksType("DECIMAL(20)") as Decimal; + expect(t.precision).toBe(20); + expect(t.scale).toBe(0); + }); + + 
test("Bare DECIMAL falls back to default precision/scale", () => { + const t = parseDatabricksType("DECIMAL") as Decimal; + expect(t).toBeInstanceOf(Decimal); + expect(typeof t.precision).toBe("number"); + expect(typeof t.scale).toBe("number"); + }); +}); + +// ============================================================================ +// INTERVAL types +// ============================================================================ + +describe("parseDatabricksType — INTERVAL", () => { + test("INTERVAL YEAR → IntervalYearMonth", () => { + expect(parseDatabricksType("INTERVAL YEAR")).toBeInstanceOf( + IntervalYearMonth, + ); + }); + + test("INTERVAL MONTH → IntervalYearMonth", () => { + expect(parseDatabricksType("INTERVAL MONTH")).toBeInstanceOf( + IntervalYearMonth, + ); + }); + + test("INTERVAL YEAR TO MONTH → IntervalYearMonth", () => { + expect(parseDatabricksType("INTERVAL YEAR TO MONTH")).toBeInstanceOf( + IntervalYearMonth, + ); + }); + + test("INTERVAL DAY → DurationMicrosecond", () => { + expect(parseDatabricksType("INTERVAL DAY")).toBeInstanceOf( + DurationMicrosecond, + ); + }); + + test("INTERVAL DAY TO SECOND → DurationMicrosecond", () => { + expect(parseDatabricksType("INTERVAL DAY TO SECOND")).toBeInstanceOf( + DurationMicrosecond, + ); + }); + + test("INTERVAL HOUR TO MINUTE → DurationMicrosecond", () => { + expect(parseDatabricksType("INTERVAL HOUR TO MINUTE")).toBeInstanceOf( + DurationMicrosecond, + ); + }); +}); + +// ============================================================================ +// ARRAY +// ============================================================================ + +describe("parseDatabricksType — ARRAY", () => { + test("ARRAY → List", () => { + const t = parseDatabricksType("ARRAY") as List; + expect(t).toBeInstanceOf(List); + expect(t.children?.[0]?.type).toBeInstanceOf(Utf8); + }); + + test("ARRAY → List", () => { + const t = parseDatabricksType("ARRAY") as List; + expect(t.children?.[0]?.type).toBeInstanceOf(Int32); + }); + + test("ARRAY preserves precision/scale", () => { + const t = parseDatabricksType("ARRAY") as List; + const inner = t.children?.[0]?.type as Decimal; + expect(inner).toBeInstanceOf(Decimal); + expect(inner.precision).toBe(10); + expect(inner.scale).toBe(2); + }); + + test("ARRAY> — nested twice", () => { + const t = parseDatabricksType("ARRAY>") as List; + const inner1 = t.children?.[0]?.type as List; + expect(inner1).toBeInstanceOf(List); + expect(inner1.children?.[0]?.type).toBeInstanceOf(Int32); + }); + + test("ARRAY>> — three levels deep", () => { + expect( + typeSummary(parseDatabricksType("ARRAY>>")), + ).toBe("List>>"); + }); + + test("ARRAY with whitespace", () => { + const t = parseDatabricksType("ARRAY < STRING >") as List; + expect(t.children?.[0]?.type).toBeInstanceOf(Utf8); + }); +}); + +// ============================================================================ +// MAP +// ============================================================================ + +describe("parseDatabricksType — MAP", () => { + test("MAP", () => { + expect(typeSummary(parseDatabricksType("MAP"))).toBe( + "Map", + ); + }); + + test("MAP — with whitespace", () => { + expect(typeSummary(parseDatabricksType("MAP"))).toBe( + "Map", + ); + }); + + test("MAP> — value is nested", () => { + expect(typeSummary(parseDatabricksType("MAP>"))).toBe( + "Map>", + ); + }); + + test("MAP> — fully nested", () => { + expect( + typeSummary(parseDatabricksType("MAP>")), + ).toBe("Map>"); + }); +}); + +// 
+// ============================================================================
+// STRUCT
+// ============================================================================
+
+describe("parseDatabricksType — STRUCT", () => {
+  test("STRUCT<a:INT,b:STRING>", () => {
+    const t = parseDatabricksType("STRUCT<a:INT,b:STRING>") as Struct;
+    expect(t).toBeInstanceOf(Struct);
+    expect(t.children?.length).toBe(2);
+    expect(t.children?.[0]?.name).toBe("a");
+    expect(t.children?.[0]?.type).toBeInstanceOf(Int32);
+    expect(t.children?.[1]?.name).toBe("b");
+    expect(t.children?.[1]?.type).toBeInstanceOf(Utf8);
+  });
+
+  test("STRUCT with whitespace and many fields", () => {
+    const t = parseDatabricksType(
+      "STRUCT<id : BIGINT , name : STRING , ts : TIMESTAMP>",
+    ) as Struct;
+    expect(t.children?.map((f) => f.name)).toEqual(["id", "name", "ts"]);
+    expect(t.children?.[0]?.type).toBeInstanceOf(Int64);
+    expect(t.children?.[2]?.type).toBeInstanceOf(TimestampMicrosecond);
+  });
+
+  test("STRUCT with COMMENT on a field", () => {
+    const t = parseDatabricksType(
+      "STRUCT<id:INT COMMENT 'the id', name:STRING>",
+    ) as Struct;
+    expect(t.children?.length).toBe(2);
+    expect(t.children?.[0]?.name).toBe("id");
+    expect(t.children?.[0]?.type).toBeInstanceOf(Int32);
+    expect(t.children?.[1]?.name).toBe("name");
+  });
+
+  test("STRUCT with COMMENT containing escaped quote", () => {
+    const t = parseDatabricksType(
+      "STRUCT<id:INT COMMENT 'it''s an id', name:STRING>",
+    ) as Struct;
+    expect(t.children?.length).toBe(2);
+    expect(t.children?.[0]?.name).toBe("id");
+  });
+
+  test("STRUCT with NOT NULL annotation on a field", () => {
+    const t = parseDatabricksType(
+      "STRUCT<id:INT NOT NULL, name:STRING>",
+    ) as Struct;
+    expect(t.children?.length).toBe(2);
+    expect(t.children?.[0]?.name).toBe("id");
+  });
+
+  test("STRUCT with backticked field name", () => {
+    const t = parseDatabricksType(
+      "STRUCT<`weird name`:INT, normal:STRING>",
+    ) as Struct;
+    expect(t.children?.[0]?.name).toBe("weird name");
+    expect(t.children?.[0]?.type).toBeInstanceOf(Int32);
+  });
+
+  test("STRUCT with backticked field name containing escaped backtick", () => {
+    const t = parseDatabricksType(
+      "STRUCT<`with``tick`:INT, other:STRING>",
+    ) as Struct;
+    expect(t.children?.[0]?.name).toBe("with`tick");
+  });
+
+  test("STRUCT with nested STRUCT", () => {
+    const t = parseDatabricksType(
+      "STRUCT<nested:STRUCT<inner:INT>, name:STRING>",
+    ) as Struct;
+    expect(t.children?.length).toBe(2);
+    const nested = t.children?.[0]?.type as Struct;
+    expect(nested).toBeInstanceOf(Struct);
+    expect(nested.children?.[0]?.name).toBe("inner");
+    expect(nested.children?.[0]?.type).toBeInstanceOf(Int32);
+  });
+
+  test("Empty STRUCT<>", () => {
+    const t = parseDatabricksType("STRUCT<>") as Struct;
+    expect(t).toBeInstanceOf(Struct);
+    expect(t.children?.length).toBe(0);
+  });
+});
+
+// ============================================================================
+// Deep nesting / mixed types
+// ============================================================================
+
+describe("parseDatabricksType — deeply nested", () => {
+  test("MAP<STRING,ARRAY<STRUCT<a:INT,b:STRING>>>", () => {
+    expect(
+      typeSummary(
+        parseDatabricksType("MAP<STRING,ARRAY<STRUCT<a:INT,b:STRING>>>"),
+      ),
+    ).toBe("Map<Utf8, List<Struct<a: Int32, b: Utf8>>>");
+  });
+
+  test("ARRAY<MAP<STRING,ARRAY<MAP<STRING,INT>>>> — 4 levels mixed", () => {
+    expect(
+      typeSummary(
+        parseDatabricksType("ARRAY<MAP<STRING,ARRAY<MAP<STRING,INT>>>>"),
+      ),
+    ).toBe("List<Map<Utf8, List<Map<Utf8, Int32>>>>");
+  });
+});
+
+// ============================================================================
+// Error / robustness behavior
+// ============================================================================
+
+describe("parseDatabricksType — error / robustness", () => {
+  test("trailing garbage throws", () => {
+    expect(() => parseDatabricksType("INT junk")).toThrow();
+  });
+
test("unmatched < throws", () => { + expect(() => parseDatabricksType("ARRAY { + expect(() => parseDatabricksType("DECIMAL(10,2")).toThrow(); + }); + + test("empty string throws", () => { + expect(() => parseDatabricksType("")).toThrow(); + }); +}); + +// ============================================================================ +// buildEmptyArrowIPCBase64 — round-trip +// ============================================================================ + +describe("buildEmptyArrowIPCBase64", () => { + test("produces a decodable empty Arrow Table with the right schema", () => { + const columns = [ + { name: "user_id", type_text: "BIGINT" }, + { name: "name", type_text: "STRING" }, + { name: "created_at", type_text: "TIMESTAMP" }, + { name: "balance", type_text: "DECIMAL(10,2)" }, + { name: "active", type_text: "BOOLEAN" }, + ]; + const b64 = buildEmptyArrowIPCBase64(columns); + const buf = Buffer.from(b64, "base64"); + const table = tableFromIPC(buf); + expect(table.numRows).toBe(0); + expect(table.numCols).toBe(5); + expect(table.schema.fields.map((f) => f.name)).toEqual([ + "user_id", + "name", + "created_at", + "balance", + "active", + ]); + expect( + (table.schema.fields[0]?.type as { bitWidth?: number }).bitWidth, + ).toBe(64); + expect(table.schema.fields[1]?.type).toBeInstanceOf(Utf8); + // After IPC round-trip Arrow JS resolves Timestamp* subclasses to a + // generic Timestamp with `unit` and `timezone`; assert structurally. + expect(table.schema.fields[2]?.type.typeId).toBe(Type.Timestamp); + expect((table.schema.fields[2]?.type as { unit?: number }).unit).toBe(2); // TimeUnit.MICROSECOND + const decimal = table.schema.fields[3]?.type as Decimal; + expect(decimal).toBeInstanceOf(Decimal); + expect(decimal.precision).toBe(10); + expect(decimal.scale).toBe(2); + expect(table.schema.fields[4]?.type).toBeInstanceOf(Bool); + }); + + test("round-trips nested types end-to-end", () => { + const columns = [ + { name: "tags", type_text: "ARRAY" }, + { name: "meta", type_text: "STRUCT" }, + { name: "counts", type_text: "MAP" }, + ]; + const buf = Buffer.from(buildEmptyArrowIPCBase64(columns), "base64"); + const table = tableFromIPC(buf); + expect(table.numRows).toBe(0); + expect(table.numCols).toBe(3); + expect(table.schema.fields[0]?.type).toBeInstanceOf(List); + expect(table.schema.fields[1]?.type).toBeInstanceOf(Struct); + expect(table.schema.fields[2]?.type).toBeInstanceOf(Map_); + }); + + test("falls back from type_text to type_name when type_text missing", () => { + const columns = [{ name: "id", type_name: "BIGINT" }]; + const buf = Buffer.from(buildEmptyArrowIPCBase64(columns), "base64"); + const table = tableFromIPC(buf); + expect( + (table.schema.fields[0]?.type as { bitWidth?: number }).bitWidth, + ).toBe(64); + }); + + test("unknown type degrades to Utf8 without throwing", () => { + const columns = [ + { name: "id", type_text: "BIGINT" }, + { name: "weird", type_text: "FUTURE_TYPE_NOT_YET_SUPPORTED" }, + ]; + const buf = Buffer.from(buildEmptyArrowIPCBase64(columns), "base64"); + const table = tableFromIPC(buf); + expect( + (table.schema.fields[0]?.type as { bitWidth?: number }).bitWidth, + ).toBe(64); + expect(table.schema.fields[1]?.type).toBeInstanceOf(Utf8); + }); + + test("missing column name gets a synthesized placeholder", () => { + const columns = [{ type_text: "STRING" }, { name: "", type_text: "INT" }]; + const buf = Buffer.from(buildEmptyArrowIPCBase64(columns), "base64"); + const table = tableFromIPC(buf); + expect(table.schema.fields[0]?.name).toBe("column_0"); + 
expect(table.schema.fields[1]?.name).toBe("column_1"); + }); + + test("empty schema produces a valid 0-column 0-row Table", () => { + const buf = Buffer.from(buildEmptyArrowIPCBase64([]), "base64"); + const table = tableFromIPC(buf); + expect(table.numRows).toBe(0); + expect(table.numCols).toBe(0); + }); +}); diff --git a/packages/appkit/src/connectors/sql-warehouse/tests/client.test.ts b/packages/appkit/src/connectors/sql-warehouse/tests/client.test.ts index 5f779ff05..5c5e7a91d 100644 --- a/packages/appkit/src/connectors/sql-warehouse/tests/client.test.ts +++ b/packages/appkit/src/connectors/sql-warehouse/tests/client.test.ts @@ -1,4 +1,5 @@ import type { sql } from "@databricks/sdk-experimental"; +import { tableFromIPC } from "apache-arrow"; import { describe, expect, test, vi } from "vitest"; vi.mock("../../../telemetry", () => { @@ -213,6 +214,83 @@ describe("SQLWarehouseConnector._transformDataArray", () => { expect(result.result.attachment).toBe(REAL_ARROW_ATTACHMENT); }); + test("synthesizes an empty Arrow IPC attachment for empty results so the client always gets a Table", () => { + const connector = createConnector(); + // Empty result: no attachment, no data_array, no external_links — but + // the manifest still describes the schema. The connector should fill in + // `attachment` with a zero-row Arrow IPC matching the schema. + const response = { + statement_id: "stmt-empty", + status: { state: "SUCCEEDED" }, + manifest: { + format: "ARROW_STREAM", + schema: { + columns: [ + { name: "user_id", type_text: "BIGINT", type_name: "BIGINT" }, + { name: "name", type_text: "STRING", type_name: "STRING" }, + { + name: "balance", + type_text: "DECIMAL(10,2)", + type_name: "DECIMAL", + }, + ], + }, + total_row_count: 0, + }, + result: {}, + } as unknown as sql.StatementResponse; + + const transformed = (connector as any)._transformDataArray(response); + const attachment: string = transformed.result.attachment; + expect(typeof attachment).toBe("string"); + expect(attachment.length).toBeGreaterThan(0); + + // Verify the synthesized attachment decodes into the right empty schema. + const table = tableFromIPC(Buffer.from(attachment, "base64")); + expect(table.numRows).toBe(0); + expect(table.schema.fields.map((f) => f.name)).toEqual([ + "user_id", + "name", + "balance", + ]); + }); + + test("does NOT synthesize an attachment when external_links are present", () => { + const connector = createConnector(); + const response = { + statement_id: "stmt-ext", + status: { state: "SUCCEEDED" }, + manifest: { + format: "ARROW_STREAM", + schema: { columns: [{ name: "x", type_text: "INT" }] }, + }, + result: { + external_links: [ + { external_link: "https://example.com/x", expiration: "9999" }, + ], + }, + } as unknown as sql.StatementResponse; + + const transformed = (connector as any)._transformDataArray(response); + // External-links path returns the statement_id projection — no attachment. + expect(transformed.result.attachment).toBeUndefined(); + expect(transformed.result.statement_id).toBe("stmt-ext"); + }); + + test("does NOT synthesize an attachment when schema is missing", () => { + const connector = createConnector(); + const response = { + statement_id: "stmt-no-schema", + status: { state: "SUCCEEDED" }, + manifest: { format: "ARROW_STREAM" }, + result: {}, + } as unknown as sql.StatementResponse; + + const transformed = (connector as any)._transformDataArray(response); + // Without a schema we cannot build a Table — pass through unchanged. 
+ expect(transformed.result?.attachment).toBeUndefined(); + }); + test("rejects oversized attachments to bound memory", () => { const connector = createConnector(); // 64 MiB cap → ~85 MiB of base64 chars decode to >64 MiB. From aef90421cf5d671ef15f7ec92b3cfac28155c2f6 Mon Sep 17 00:00:00 2001 From: James Broadhead Date: Tue, 28 Apr 2026 17:48:34 +0000 Subject: [PATCH 16/17] fix: align SSE event size with inline Arrow attachment cap (8 MiB) The previous SSE event-size limits (1 MiB on both server and client) were below the connector's 64 MiB cap on inline Arrow IPC attachments. Anything between 1 MiB and 64 MiB would pass the connector check but get rejected at the SSE layer with a confusing "Buffer size exceeded" error. The 64 MiB cap was effectively unreachable. This commit aligns all three caps at 8 MiB: - `streamDefaults.maxEventSize` (server SSE event cap) - `connectSSE.maxBufferSize` (client SSE buffer cap) - `MAX_INLINE_ATTACHMENT_BYTES` (connector inline attachment cap) 8 MiB is well above what analytics queries return in practice (most are well under 1 MiB), gives ~3x headroom for compact ARROW_STREAM payloads, and stays below the Databricks Statement Execution API hard limit on INLINE disposition (~25 MiB). Anything larger than 8 MiB now fails fast at the connector with a typed `ExecutionError`, and ARROW_STREAM falls back to EXTERNAL_LINKS which has no size pressure. A follow-up draft PR explores stash-and-serve via /arrow-result/:jobId as a stronger architectural solution that removes the SSE size cap entirely. Co-authored-by: Isaac Signed-off-by: James Broadhead --- packages/appkit-ui/src/js/sse/connect-sse.ts | 5 ++++- packages/appkit/src/connectors/sql-warehouse/client.ts | 10 ++++++++-- .../src/connectors/sql-warehouse/tests/client.test.ts | 4 ++-- packages/appkit/src/stream/defaults.ts | 8 +++++++- 4 files changed, 21 insertions(+), 6 deletions(-) diff --git a/packages/appkit-ui/src/js/sse/connect-sse.ts b/packages/appkit-ui/src/js/sse/connect-sse.ts index c4fd4500d..089ddf579 100644 --- a/packages/appkit-ui/src/js/sse/connect-sse.ts +++ b/packages/appkit-ui/src/js/sse/connect-sse.ts @@ -18,7 +18,10 @@ export async function connectSSE( lastEventId: initialLastEventId = null, retryDelay = 2000, maxRetries = 3, - maxBufferSize = 1024 * 1024, // 1MB + // 8 MiB — sized to receive inline Arrow IPC attachments from + // ARROW_STREAM analytics responses; matches the server's stream + // `maxEventSize`. Most events are well under 1 MiB in practice. + maxBufferSize = 8 * 1024 * 1024, timeout = 300000, // 5 minutes onError, } = options; diff --git a/packages/appkit/src/connectors/sql-warehouse/client.ts b/packages/appkit/src/connectors/sql-warehouse/client.ts index 76f1b219c..c6a9011e7 100644 --- a/packages/appkit/src/connectors/sql-warehouse/client.ts +++ b/packages/appkit/src/connectors/sql-warehouse/client.ts @@ -26,8 +26,14 @@ import { executeStatementDefaults } from "./defaults"; const logger = createLogger("connectors:sql-warehouse"); -/** Maximum size for inline Arrow IPC attachments (64 MiB decoded). */ -const MAX_INLINE_ATTACHMENT_BYTES = 64 * 1024 * 1024; +/** + * Maximum size for inline Arrow IPC attachments (8 MiB decoded). + * Aligned with `streamDefaults.maxEventSize` so anything that would exceed + * the SSE event cap fails here with a clear error rather than a confusing + * "Buffer size exceeded" downstream. Larger results should use + * `disposition: "EXTERNAL_LINKS"`, which the analytics fallback handles. 
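+ *
+ * Note the units: the cap applies to decoded bytes. Base64 inflates payloads
+ * by 4/3, so an attachment right at the cap is roughly 10.7 MiB of base64
+ * text inside the SSE message.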
+ */ +const MAX_INLINE_ATTACHMENT_BYTES = 8 * 1024 * 1024; interface SQLWarehouseConfig { timeout?: number; diff --git a/packages/appkit/src/connectors/sql-warehouse/tests/client.test.ts b/packages/appkit/src/connectors/sql-warehouse/tests/client.test.ts index 5c5e7a91d..c7f73c988 100644 --- a/packages/appkit/src/connectors/sql-warehouse/tests/client.test.ts +++ b/packages/appkit/src/connectors/sql-warehouse/tests/client.test.ts @@ -293,8 +293,8 @@ describe("SQLWarehouseConnector._transformDataArray", () => { test("rejects oversized attachments to bound memory", () => { const connector = createConnector(); - // 64 MiB cap → ~85 MiB of base64 chars decode to >64 MiB. - const oversized = "A".repeat(90 * 1024 * 1024); + // 8 MiB decoded cap → ~12 MiB of base64 chars decodes to >8 MiB. + const oversized = "A".repeat(12 * 1024 * 1024); const response = { statement_id: "stmt-oversized", status: { state: "SUCCEEDED" }, diff --git a/packages/appkit/src/stream/defaults.ts b/packages/appkit/src/stream/defaults.ts index c8fc91591..9212ebca3 100644 --- a/packages/appkit/src/stream/defaults.ts +++ b/packages/appkit/src/stream/defaults.ts @@ -1,6 +1,12 @@ export const streamDefaults = { bufferSize: 100, - maxEventSize: 1024 * 1024, // 1MB + // 8 MiB. Sized to fit base64-encoded inline Arrow IPC attachments from + // serverless warehouses (analytics queries typically return well under 1 MiB, + // but ARROW_STREAM + INLINE can carry up to ~25 MiB per the Databricks API). + // The connector enforces the same cap (`MAX_INLINE_ATTACHMENT_BYTES`) so + // anything that would exceed this fails fast at the connector with a clear + // error rather than a confusing SSE buffer-exceeded. + maxEventSize: 8 * 1024 * 1024, bufferTTL: 10 * 60 * 1000, // 10 minutes cleanupInterval: 5 * 60 * 1000, // 5 minutes maxPersistentBuffers: 10000, // 10000 buffers From 6e8b12e8abb37818c9c8770d100d0164b8051c3e Mon Sep 17 00:00:00 2001 From: James Broadhead Date: Tue, 28 Apr 2026 17:57:03 +0000 Subject: [PATCH 17/17] =?UTF-8?q?refactor:=20proposal=20=E2=80=94=20stash?= =?UTF-8?q?=20inline=20Arrow=20IPC,=20serve=20via=20/arrow-result?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit DRAFT proposal. Replaces the `arrow_inline` SSE message type from #256 with an out-of-band delivery mechanism. Motivation: remove the SSE event-size cap as a constraint on inline Arrow result size. Architecture ------------ ARROW_STREAM responses are now delivered uniformly: 1. The connector returns the base64 Arrow IPC attachment in `result.attachment` for INLINE responses (unchanged from #256). 2. The analytics route base64-decodes it once, stashes the resulting buffer in `InlineArrowStash` (bounded LRU + TTL), and emits the same `{ type: "arrow", statement_id }` SSE message that EXTERNAL_LINKS already uses — with a synthetic id prefixed `inline-`. 3. The client (unchanged from main) calls `/arrow-result/:jobId` for any `type: "arrow"` message. 4. The route handler checks the stash first; if the id has the `inline-` prefix and is still present, it serves the bytes from memory. Otherwise it falls through to the existing warehouse fetch path for real EXTERNAL_LINKS statement_ids. The `arrow_inline` SSE message type, the client-side base64 decode helper, and the SSE buffer-size bumps from #256 all go away. Reasoning --------- SSE was designed for streams of small control messages. 
Pushing multi-MB Arrow IPC bytes through it has two unavoidable costs: - single-event memory ceiling on both server and client, - proxy/load-balancer compatibility issues for large events. Bumping the SSE buffer (5b in the design discussion) raises both caps but doesn't fix the architectural mismatch. This PR moves bulk bytes to HTTP, where they belong: - HTTP/2 streaming, gzip, and browser background-fetch all work naturally. - SSE buffer can stay at the conservative 1 MiB default. - The wire protocol unifies INLINE and EXTERNAL_LINKS — the client has a single code path for ARROW_STREAM data. - Inline result size cap goes back up to the Databricks API limit (25 MiB) instead of being constrained by SSE. Tradeoffs --------- - Server holds IPC buffers in memory until the client fetches them (one-shot `take()` removes on read; passive TTL evicts otherwise). For 100 entries × 25 MiB worst case, that's 2.5 GiB peak — but steady state is much smaller because entries are removed as soon as the client fetches. - Single-process only. A multi-server deployment would need a shared store (Redis or equivalent) keyed on the synthetic id. The stash interface is small enough to swap implementations. - One-shot reads mean the client cannot retry a failed fetch. Acceptable for our use case (warehouse query returned data, the hook either succeeds in fetching or surfaces a generic error). Status ------ DRAFT: opened to discuss the architectural direction before landing. #256 ships with the simpler 5b SSE-buffer-bump approach (8 MiB) which is sufficient for the realistic analytics range. This proposal is the cleaner long-term solution if the team wants to invest in the server-side state. Co-authored-by: Isaac Signed-off-by: James Broadhead --- packages/appkit-ui/src/js/sse/connect-sse.ts | 5 +- .../__tests__/use-analytics-query.test.ts | 82 +++++-------- .../src/react/hooks/use-analytics-query.ts | 44 +------ .../src/connectors/sql-warehouse/client.ts | 12 +- .../sql-warehouse/tests/client.test.ts | 4 +- .../appkit/src/plugins/analytics/analytics.ts | 34 +++++- .../plugins/analytics/inline-arrow-stash.ts | 101 ++++++++++++++++ .../plugins/analytics/tests/analytics.test.ts | 15 ++- .../tests/inline-arrow-stash.test.ts | 112 ++++++++++++++++++ packages/appkit/src/stream/defaults.ts | 12 +- 10 files changed, 296 insertions(+), 125 deletions(-) create mode 100644 packages/appkit/src/plugins/analytics/inline-arrow-stash.ts create mode 100644 packages/appkit/src/plugins/analytics/tests/inline-arrow-stash.test.ts diff --git a/packages/appkit-ui/src/js/sse/connect-sse.ts b/packages/appkit-ui/src/js/sse/connect-sse.ts index 089ddf579..c4fd4500d 100644 --- a/packages/appkit-ui/src/js/sse/connect-sse.ts +++ b/packages/appkit-ui/src/js/sse/connect-sse.ts @@ -18,10 +18,7 @@ export async function connectSSE( lastEventId: initialLastEventId = null, retryDelay = 2000, maxRetries = 3, - // 8 MiB — sized to receive inline Arrow IPC attachments from - // ARROW_STREAM analytics responses; matches the server's stream - // `maxEventSize`. Most events are well under 1 MiB in practice. 
- maxBufferSize = 8 * 1024 * 1024, + maxBufferSize = 1024 * 1024, // 1MB timeout = 300000, // 5 minutes onError, } = options; diff --git a/packages/appkit-ui/src/react/hooks/__tests__/use-analytics-query.test.ts b/packages/appkit-ui/src/react/hooks/__tests__/use-analytics-query.test.ts index 81159792a..e93c695d1 100644 --- a/packages/appkit-ui/src/react/hooks/__tests__/use-analytics-query.test.ts +++ b/packages/appkit-ui/src/react/hooks/__tests__/use-analytics-query.test.ts @@ -30,93 +30,65 @@ describe("useAnalyticsQuery", () => { lastConnectArgs = null; }); - test("decodes arrow_inline base64 attachment via ArrowClient.processArrowBuffer", async () => { - const fakeTable = { numRows: 1, schema: { fields: [] } }; + test("fetches Arrow IPC via /arrow-result for type:arrow (covers both inline-stash and external-link paths)", async () => { + const fakeTable = { numRows: 0, schema: { fields: [] } }; + mockFetchArrow.mockResolvedValueOnce(new Uint8Array([1, 2, 3])); mockProcessArrowBuffer.mockResolvedValueOnce(fakeTable); - // 'AQID' decodes to bytes [1, 2, 3]. - const base64 = "AQID"; - const { result } = renderHook(() => useAnalyticsQuery("q", null, { format: "ARROW_STREAM" }), ); - // Drive the SSE onMessage handler with an arrow_inline payload. + // Server emits the same {type:"arrow", statement_id} shape regardless of + // whether the bytes came from the warehouse (EXTERNAL_LINKS) or were + // stashed locally (INLINE). await lastConnectArgs.onMessage({ - data: JSON.stringify({ type: "arrow_inline", attachment: base64 }), + data: JSON.stringify({ type: "arrow", statement_id: "inline-abc" }), }); await waitFor(() => { expect(result.current.data).toBe(fakeTable); }); - - expect(mockProcessArrowBuffer).toHaveBeenCalledTimes(1); - const passedBuffer = mockProcessArrowBuffer.mock.calls[0][0] as Uint8Array; - expect(passedBuffer).toBeInstanceOf(Uint8Array); - expect(Array.from(passedBuffer)).toEqual([1, 2, 3]); - // Inline path must NOT trigger a network fetch. 
-    expect(mockFetchArrow).not.toHaveBeenCalled();
+    expect(mockFetchArrow).toHaveBeenCalledTimes(1);
+    expect(mockFetchArrow.mock.calls[0][0]).toBe(
+      "/api/analytics/arrow-result/inline-abc",
+    );
   });
 
-  test("surfaces an error when arrow_inline decode fails", async () => {
-    mockProcessArrowBuffer.mockRejectedValueOnce(new Error("bad ipc"));
-
+  test("still handles type:result rows for JSON_ARRAY", async () => {
     const { result } = renderHook(() =>
-      useAnalyticsQuery("q", null, { format: "ARROW_STREAM" }),
+      useAnalyticsQuery("q", null, { format: "JSON_ARRAY" }),
     );
 
     await lastConnectArgs.onMessage({
-      data: JSON.stringify({ type: "arrow_inline", attachment: "AQID" }),
+      data: JSON.stringify({
+        type: "result",
+        data: [{ id: 1 }, { id: 2 }],
+      }),
     });
 
     await waitFor(() => {
-      expect(result.current.error).toBe(
-        "Unable to load data, please try again",
-      );
+      expect(result.current.data).toEqual([{ id: 1 }, { id: 2 }]);
     });
-    expect(result.current.loading).toBe(false);
+    expect(mockProcessArrowBuffer).not.toHaveBeenCalled();
   });
 
-  test("rejects arrow_inline with missing/empty/non-string attachment without crashing atob", async () => {
-    const cases: Array<unknown> = [undefined, null, "", 123, { foo: "bar" }];
-
-    for (const attachment of cases) {
-      mockProcessArrowBuffer.mockClear();
-      const { result, unmount } = renderHook(() =>
-        useAnalyticsQuery("q", null, { format: "ARROW_STREAM" }),
-      );
-
-      await lastConnectArgs.onMessage({
-        data: JSON.stringify({ type: "arrow_inline", attachment }),
-      });
-
-      await waitFor(() => {
-        expect(result.current.error).toBe(
-          "Unable to load data, please try again",
-        );
-      });
-      // Critically: must NOT call processArrowBuffer (or atob) on the bad input.
-      expect(mockProcessArrowBuffer).not.toHaveBeenCalled();
-
-      unmount();
-    }
-  });
+  test("surfaces an error when /arrow-result fetch fails", async () => {
+    mockFetchArrow.mockRejectedValueOnce(new Error("HTTP 404"));
 
-  test("still handles type:result rows for JSON_ARRAY", async () => {
     const { result } = renderHook(() =>
-      useAnalyticsQuery("q", null, { format: "JSON_ARRAY" }),
+      useAnalyticsQuery("q", null, { format: "ARROW_STREAM" }),
     );
 
     await lastConnectArgs.onMessage({
-      data: JSON.stringify({
-        type: "result",
-        data: [{ id: 1 }, { id: 2 }],
-      }),
+      data: JSON.stringify({ type: "arrow", statement_id: "inline-stale" }),
     });
 
     await waitFor(() => {
-      expect(result.current.data).toEqual([{ id: 1 }, { id: 2 }]);
+      expect(result.current.error).toBe(
+        "Unable to load data, please try again",
+      );
     });
-    expect(mockProcessArrowBuffer).not.toHaveBeenCalled();
+    expect(result.current.loading).toBe(false);
   });
 });
 
diff --git a/packages/appkit-ui/src/react/hooks/use-analytics-query.ts b/packages/appkit-ui/src/react/hooks/use-analytics-query.ts
index 1817699d8..3c7038232 100644
--- a/packages/appkit-ui/src/react/hooks/use-analytics-query.ts
+++ b/packages/appkit-ui/src/react/hooks/use-analytics-query.ts
@@ -22,16 +22,6 @@ function getArrowStreamUrl(id: string) {
   return `/api/analytics/arrow-result/${id}`;
 }
 
-/** Decode a base64 string into a Uint8Array suitable for Arrow IPC parsing. */
-function decodeBase64(b64: string): Uint8Array {
-  const binary = atob(b64);
-  const bytes = new Uint8Array(binary.length);
-  for (let i = 0; i < binary.length; i++) {
-    bytes[i] = binary.charCodeAt(i);
-  }
-  return bytes;
-}
-
 /**
  * Subscribe to an analytics query over SSE and returns its latest result.
  * Integration hook between client and analytics plugin.
@@ -139,7 +129,9 @@ export function useAnalyticsQuery< return; } - // success - Arrow format (external links: fetch from server) + // success - Arrow format. The server delivers Arrow IPC bytes via + // /arrow-result/:jobId for both INLINE (stashed server-side) and + // EXTERNAL_LINKS (forwarded from the warehouse) responses. if (parsed.type === "arrow") { try { const arrowData = await ArrowClient.fetchArrow( @@ -161,36 +153,6 @@ export function useAnalyticsQuery< } } - // success - Arrow format (inline: decode base64 IPC payload locally) - if (parsed.type === "arrow_inline") { - if ( - typeof parsed.attachment !== "string" || - parsed.attachment.length === 0 - ) { - console.error( - "[useAnalyticsQuery] arrow_inline message missing attachment", - ); - setLoading(false); - setError("Unable to load data, please try again"); - return; - } - try { - const buffer = decodeBase64(parsed.attachment); - const table = await ArrowClient.processArrowBuffer(buffer); - setLoading(false); - setData(table as ResultType); - return; - } catch (error) { - console.error( - "[useAnalyticsQuery] Failed to decode inline Arrow data", - error, - ); - setLoading(false); - setError("Unable to load data, please try again"); - return; - } - } - // error if (parsed.type === "error" || parsed.error || parsed.code) { const errorMsg = diff --git a/packages/appkit/src/connectors/sql-warehouse/client.ts b/packages/appkit/src/connectors/sql-warehouse/client.ts index c6a9011e7..c47cb8693 100644 --- a/packages/appkit/src/connectors/sql-warehouse/client.ts +++ b/packages/appkit/src/connectors/sql-warehouse/client.ts @@ -27,13 +27,13 @@ import { executeStatementDefaults } from "./defaults"; const logger = createLogger("connectors:sql-warehouse"); /** - * Maximum size for inline Arrow IPC attachments (8 MiB decoded). - * Aligned with `streamDefaults.maxEventSize` so anything that would exceed - * the SSE event cap fails here with a clear error rather than a confusing - * "Buffer size exceeded" downstream. Larger results should use - * `disposition: "EXTERNAL_LINKS"`, which the analytics fallback handles. + * Maximum size for inline Arrow IPC attachments (25 MiB decoded). + * Matches the Databricks Statement Execution API hard cap on INLINE + * disposition. The bytes are stashed server-side (see InlineArrowStash) and + * served out of band via /arrow-result/:jobId, so the SSE event-size cap + * does not apply here. */ -const MAX_INLINE_ATTACHMENT_BYTES = 8 * 1024 * 1024; +const MAX_INLINE_ATTACHMENT_BYTES = 25 * 1024 * 1024; interface SQLWarehouseConfig { timeout?: number; diff --git a/packages/appkit/src/connectors/sql-warehouse/tests/client.test.ts b/packages/appkit/src/connectors/sql-warehouse/tests/client.test.ts index c7f73c988..84ae0b54e 100644 --- a/packages/appkit/src/connectors/sql-warehouse/tests/client.test.ts +++ b/packages/appkit/src/connectors/sql-warehouse/tests/client.test.ts @@ -293,8 +293,8 @@ describe("SQLWarehouseConnector._transformDataArray", () => { test("rejects oversized attachments to bound memory", () => { const connector = createConnector(); - // 8 MiB decoded cap → ~12 MiB of base64 chars decodes to >8 MiB. - const oversized = "A".repeat(12 * 1024 * 1024); + // 25 MiB decoded cap → ~36 MiB of base64 chars decodes to >25 MiB. 
+ const oversized = "A".repeat(36 * 1024 * 1024); const response = { statement_id: "stmt-oversized", status: { state: "SUCCEEDED" }, diff --git a/packages/appkit/src/plugins/analytics/analytics.ts b/packages/appkit/src/plugins/analytics/analytics.ts index 56384bf86..44fd234ae 100644 --- a/packages/appkit/src/plugins/analytics/analytics.ts +++ b/packages/appkit/src/plugins/analytics/analytics.ts @@ -12,6 +12,7 @@ import { createLogger } from "../../logging/logger"; import { Plugin, toPlugin } from "../../plugin"; import type { PluginManifest } from "../../registry"; import { queryDefaults } from "./defaults"; +import { InlineArrowStash } from "./inline-arrow-stash"; import manifest from "./manifest.json"; import { QueryProcessor } from "./query"; import type { @@ -33,11 +34,13 @@ export class AnalyticsPlugin extends Plugin { // analytics services private SQLClient: SQLWarehouseConnector; private queryProcessor: QueryProcessor; + private inlineStash: InlineArrowStash; constructor(config: IAnalyticsConfig) { super(config); this.config = config; this.queryProcessor = new QueryProcessor(); + this.inlineStash = new InlineArrowStash(); this.SQLClient = new SQLWarehouseConnector({ timeout: config.timeout, @@ -76,6 +79,24 @@ export class AnalyticsPlugin extends Plugin { ): Promise { try { const { jobId } = req.params; + + // Inline path: ARROW_STREAM + INLINE responses are stashed by the query + // route and served from memory rather than fetched from the warehouse. + const stashed = this.inlineStash.take(jobId); + if (stashed) { + res.setHeader("Content-Type", "application/octet-stream"); + res.setHeader("Content-Length", stashed.length.toString()); + // Don't cache — stash entries are one-shot. + res.setHeader("Cache-Control", "no-store"); + logger.debug( + "Serving inline Arrow buffer from stash: %d bytes for jobId=%s", + stashed.length, + jobId, + ); + res.send(stashed); + return; + } + const workspaceClient = getWorkspaceClient(); logger.debug("Processing Arrow job request for jobId=%s", jobId); @@ -247,12 +268,15 @@ export class AnalyticsPlugin extends Plugin { { disposition: "INLINE", format: "ARROW_STREAM" }, signal, ); - // INLINE responses with an Arrow IPC attachment are forwarded as base64 - // for the client to decode into an Arrow Table. Anything else (rare: - // data_array under ARROW_STREAM, or an empty result) falls back to the - // generic "result" payload. + // INLINE responses carry the Arrow IPC bytes as a base64 `attachment`. + // Stash them server-side and emit the same `{type:"arrow", statement_id}` + // shape that EXTERNAL_LINKS uses, so the client fetches the bytes via + // the existing /arrow-result/:jobId path. This keeps SSE messages + // small and unifies the wire protocol. if (result?.attachment) { - return { type: "arrow_inline", attachment: result.attachment }; + const buffer = Buffer.from(result.attachment, "base64"); + const statement_id = this.inlineStash.put(buffer); + return { type: "arrow", statement_id }; } return { type: "result", ...result }; } catch (err: unknown) { diff --git a/packages/appkit/src/plugins/analytics/inline-arrow-stash.ts b/packages/appkit/src/plugins/analytics/inline-arrow-stash.ts new file mode 100644 index 000000000..72aec4dd4 --- /dev/null +++ b/packages/appkit/src/plugins/analytics/inline-arrow-stash.ts @@ -0,0 +1,101 @@ +import { randomUUID } from "node:crypto"; + +/** + * Bounded TTL stash of inline Arrow IPC payloads, keyed by a synthetic + * statement ID prefixed with `inline-`. 
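+ *
+ * Sketch of the round trip (API as implemented below):
+ *
+ *   const stash = new InlineArrowStash();
+ *   const id = stash.put(ipcBuffer);  // => "inline-<uuid>"
+ *   // ...later, in the /arrow-result/:jobId handler:
+ *   const bytes = stash.take(id);     // Buffer on first read, then null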
+ *
+ * The analytics route puts each ARROW_STREAM + INLINE response into the stash
+ * and emits `{ type: "arrow", statement_id: <synthetic id> }` over SSE. The
+ * client fetches the bytes via the existing `/arrow-result/:jobId` endpoint,
+ * which checks this stash first and only delegates to the warehouse fetch
+ * when the ID is not stashed (i.e., a real EXTERNAL_LINKS statement).
+ *
+ * Decoupling bulk bytes from the SSE channel keeps `streamDefaults.maxEventSize`
+ * small (control messages stay small), at the cost of in-process memory for
+ * the duration of the round trip.
+ *
+ * Bounds:
+ * - per-entry size: enforced upstream by the connector (`MAX_INLINE_ATTACHMENT_BYTES`).
+ * - max entries: LRU-evict when full.
+ * - TTL: time after which an unread entry is dropped.
+ *
+ * Reads are one-shot — `take()` removes the entry — because each query has
+ * exactly one consumer. This bounds peak memory in steady state to roughly
+ * one entry per active analytics query, not `maxEntries × maxBytes`.
+ *
+ * Single-process only. A multi-server deployment would need a shared store
+ * (e.g. Redis) — see PR description for the limitation.
+ */
+interface InlineArrowStashOptions {
+  /** Maximum number of pending entries (LRU eviction beyond this). Default 100. */
+  maxEntries?: number;
+  /** Time in ms before an unread entry is auto-evicted. Default 60_000 (60s). */
+  ttlMs?: number;
+}
+
+interface StashEntry {
+  buffer: Buffer;
+  expiresAt: number;
+}
+
+export class InlineArrowStash {
+  private readonly entries = new Map<string, StashEntry>();
+  private readonly maxEntries: number;
+  private readonly ttlMs: number;
+
+  constructor(options: InlineArrowStashOptions = {}) {
+    this.maxEntries = options.maxEntries ?? 100;
+    this.ttlMs = options.ttlMs ?? 60_000;
+  }
+
+  /** Stash an Arrow IPC buffer and return the synthetic statement_id. */
+  put(buffer: Buffer): string {
+    this._evictExpired();
+    while (this.entries.size >= this.maxEntries) {
+      // LRU: oldest insertion order — Map iterates in insertion order.
+      const oldestKey = this.entries.keys().next().value;
+      if (oldestKey === undefined) break;
+      this.entries.delete(oldestKey);
+    }
+    const id = `inline-${randomUUID()}`;
+    this.entries.set(id, {
+      buffer,
+      expiresAt: Date.now() + this.ttlMs,
+    });
+    return id;
+  }
+
+  /**
+   * Retrieve and remove a stashed buffer. Returns `null` if the id is not in
+   * the stash, expired, or not prefixed `inline-` (in which case the caller
+   * should treat it as a real warehouse statement_id).
+   */
+  take(id: string): Buffer | null {
+    if (!id.startsWith("inline-")) return null;
+    const entry = this.entries.get(id);
+    if (!entry) return null;
+    this.entries.delete(id);
+    if (entry.expiresAt < Date.now()) return null;
+    return entry.buffer;
+  }
+
+  /** Drop expired entries without consuming them. */
+  private _evictExpired(): void {
+    const now = Date.now();
+    for (const [id, entry] of this.entries) {
+      if (entry.expiresAt < now) {
+        this.entries.delete(id);
+      }
+    }
+  }
+
+  /** For tests/observability. */
+  size(): number {
+    return this.entries.size;
+  }
+
+  /** For tests.
*/ + clear(): void { + this.entries.clear(); + } +} diff --git a/packages/appkit/src/plugins/analytics/tests/analytics.test.ts b/packages/appkit/src/plugins/analytics/tests/analytics.test.ts index a8c7821c1..43e8db131 100644 --- a/packages/appkit/src/plugins/analytics/tests/analytics.test.ts +++ b/packages/appkit/src/plugins/analytics/tests/analytics.test.ts @@ -848,7 +848,7 @@ describe("Analytics Plugin", () => { } }); - test("/query/:query_key emits arrow_inline SSE event when ARROW_STREAM INLINE returns an attachment", async () => { + test("/query/:query_key stashes ARROW_STREAM INLINE attachment and emits arrow with synthetic statement_id", async () => { const plugin = new AnalyticsPlugin(config); const { router, getHandler } = createMockRouter(); @@ -857,7 +857,8 @@ describe("Analytics Plugin", () => { isAsUser: false, }); - const fakeAttachment = "BASE64_ARROW_IPC_BYTES"; + // Real-ish base64 — must be valid for `Buffer.from(..., "base64")`. + const fakeAttachment = Buffer.from("ipc-bytes-here").toString("base64"); const executeMock = vi.fn().mockResolvedValue({ result: { attachment: fakeAttachment, row_count: 1 }, }); @@ -880,14 +881,18 @@ describe("Analytics Plugin", () => { disposition: "INLINE", format: "ARROW_STREAM", }); - // SSE payload should use the new arrow_inline message type. + // SSE payload uses the unified `arrow` message with an `inline-` id. const writeCalls = (mockRes.write as any).mock.calls.map( (c: any[]) => c[0] as string, ); const payload = writeCalls.find((s: string) => s.startsWith("data: ")); expect(payload).toBeDefined(); - expect(payload).toContain('"type":"arrow_inline"'); - expect(payload).toContain(`"attachment":"${fakeAttachment}"`); + expect(payload).toContain('"type":"arrow"'); + expect(payload).toMatch(/"statement_id":"inline-/); + // Attachment bytes should NOT be inlined in the SSE message. + expect(payload).not.toContain(fakeAttachment); + // The stash should now hold one entry, ready for /arrow-result fetch. 
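+      // take() is one-shot, so the client's /arrow-result fetch will also
+      // clear the entry.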
+ expect((plugin as any).inlineStash.size()).toBe(1); }); test("/query/:query_key rejects unknown format values with 400", async () => { diff --git a/packages/appkit/src/plugins/analytics/tests/inline-arrow-stash.test.ts b/packages/appkit/src/plugins/analytics/tests/inline-arrow-stash.test.ts new file mode 100644 index 000000000..cd151c196 --- /dev/null +++ b/packages/appkit/src/plugins/analytics/tests/inline-arrow-stash.test.ts @@ -0,0 +1,112 @@ +import { afterEach, beforeEach, describe, expect, test, vi } from "vitest"; +import { InlineArrowStash } from "../inline-arrow-stash"; + +describe("InlineArrowStash", () => { + beforeEach(() => { + vi.useFakeTimers(); + }); + + afterEach(() => { + vi.useRealTimers(); + }); + + test("returns a fresh inline-prefixed id for each put", () => { + const stash = new InlineArrowStash(); + const id1 = stash.put(Buffer.from("a")); + const id2 = stash.put(Buffer.from("b")); + expect(id1).toMatch(/^inline-/); + expect(id2).toMatch(/^inline-/); + expect(id1).not.toBe(id2); + }); + + test("take() returns the buffer exactly once (one-shot)", () => { + const stash = new InlineArrowStash(); + const buf = Buffer.from([1, 2, 3]); + const id = stash.put(buf); + + const first = stash.take(id); + expect(first).toBeInstanceOf(Buffer); + expect(Array.from(first as Buffer)).toEqual([1, 2, 3]); + + const second = stash.take(id); + expect(second).toBeNull(); + }); + + test("take() returns null for non-existent ids", () => { + const stash = new InlineArrowStash(); + expect(stash.take("inline-does-not-exist")).toBeNull(); + }); + + test("take() returns null for ids without the inline- prefix (treated as warehouse id)", () => { + const stash = new InlineArrowStash(); + // Even after stashing, a take() on a non-inline id is rejected — the + // route handler relies on this to forward un-prefixed ids to the + // warehouse fetch path. + stash.put(Buffer.from("x")); + expect(stash.take("01234567-real-warehouse-statement-id")).toBeNull(); + }); + + test("take() returns null after TTL expires", () => { + const stash = new InlineArrowStash({ ttlMs: 1000 }); + const id = stash.put(Buffer.from("data")); + vi.advanceTimersByTime(1500); + expect(stash.take(id)).toBeNull(); + }); + + test("take() succeeds within TTL", () => { + const stash = new InlineArrowStash({ ttlMs: 1000 }); + const id = stash.put(Buffer.from("data")); + vi.advanceTimersByTime(500); + expect(stash.take(id)).toBeInstanceOf(Buffer); + }); + + test("LRU evicts oldest entry when maxEntries is reached", () => { + const stash = new InlineArrowStash({ maxEntries: 2 }); + const id1 = stash.put(Buffer.from("a")); + const id2 = stash.put(Buffer.from("b")); + const id3 = stash.put(Buffer.from("c")); // forces eviction of id1 + + expect(stash.size()).toBe(2); + expect(stash.take(id1)).toBeNull(); + expect(stash.take(id2)).not.toBeNull(); + expect(stash.take(id3)).not.toBeNull(); + }); + + test("expired entries are dropped on next put (passive cleanup)", () => { + const stash = new InlineArrowStash({ ttlMs: 1000, maxEntries: 5 }); + const id1 = stash.put(Buffer.from("old")); + vi.advanceTimersByTime(1500); + + const id2 = stash.put(Buffer.from("new")); + // The expired id1 entry should have been evicted, so size() reflects + // only the newly-put entry. 
+ expect(stash.size()).toBe(1); + expect(stash.take(id1)).toBeNull(); + expect(stash.take(id2)).not.toBeNull(); + }); + + test("clear() empties the stash", () => { + const stash = new InlineArrowStash(); + stash.put(Buffer.from("a")); + stash.put(Buffer.from("b")); + expect(stash.size()).toBe(2); + stash.clear(); + expect(stash.size()).toBe(0); + }); + + test("ids are unguessable (UUID-backed)", () => { + // Smoke-check: 1000 ids should all be distinct, well-formed. + const stash = new InlineArrowStash({ maxEntries: 1000 }); + const ids = new Set(); + for (let i = 0; i < 1000; i++) { + ids.add(stash.put(Buffer.from([i & 0xff]))); + } + expect(ids.size).toBe(1000); + for (const id of ids) { + // inline-: 7-char prefix + 36-char UUID + expect(id).toMatch( + /^inline-[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$/, + ); + } + }); +}); diff --git a/packages/appkit/src/stream/defaults.ts b/packages/appkit/src/stream/defaults.ts index 9212ebca3..3938e5849 100644 --- a/packages/appkit/src/stream/defaults.ts +++ b/packages/appkit/src/stream/defaults.ts @@ -1,12 +1,10 @@ export const streamDefaults = { bufferSize: 100, - // 8 MiB. Sized to fit base64-encoded inline Arrow IPC attachments from - // serverless warehouses (analytics queries typically return well under 1 MiB, - // but ARROW_STREAM + INLINE can carry up to ~25 MiB per the Databricks API). - // The connector enforces the same cap (`MAX_INLINE_ATTACHMENT_BYTES`) so - // anything that would exceed this fails fast at the connector with a clear - // error rather than a confusing SSE buffer-exceeded. - maxEventSize: 8 * 1024 * 1024, + // 1 MiB. SSE carries control messages (statement_id, status, errors) and + // small JSON_ARRAY result payloads. Bulk Arrow data is delivered out of + // band via the /arrow-result/:jobId endpoint, backed by InlineArrowStash + // for INLINE responses. This keeps SSE memory pressure bounded. + maxEventSize: 1024 * 1024, bufferTTL: 10 * 60 * 1000, // 10 minutes cleanupInterval: 5 * 60 * 1000, // 5 minutes maxPersistentBuffers: 10000, // 10000 buffers