Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions packages/appkit/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,7 @@
"@opentelemetry/sdk-trace-base": "2.6.0",
"@opentelemetry/semantic-conventions": "1.38.0",
"@types/semver": "7.7.1",
"apache-arrow": "21.1.0",
"dotenv": "16.6.1",
"express": "4.22.0",
"obug": "2.1.1",
Expand Down
57 changes: 55 additions & 2 deletions packages/appkit/src/type-generator/query-registry.ts
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import fs from "node:fs/promises";
import path from "node:path";
import { WorkspaceClient } from "@databricks/sdk-experimental";
import { tableFromIPC } from "apache-arrow";
import pc from "picocolors";
import { createLogger } from "../logging/logger";
import { CACHE_VERSION, hashSQL, loadCache, saveCache } from "./cache";
Expand Down Expand Up @@ -129,18 +130,69 @@ function formatParametersType(sql: string): string {
: "Record<string, never>";
}

/**
* Decode a base64 Arrow IPC attachment from a DESCRIBE QUERY response and
* extract column metadata. Returns the same shape as rows parsed from the
* legacy data_array path.
*
* IMPORTANT: a DESCRIBE QUERY response is itself a result *table* with rows
* shaped like `(col_name, data_type, comment)` describing the user query's
* output schema. We must read those rows — NOT `table.schema.fields`, which
* would describe DESCRIBE QUERY's own output (`col_name`, `data_type`,
* `comment`) and yield bogus types for every query.
*/
/**
 * Decode a base64-encoded Arrow IPC attachment from a DESCRIBE QUERY
 * response and turn its data rows into column metadata, matching the shape
 * produced by the legacy data_array parsing path.
 *
 * IMPORTANT: a DESCRIBE QUERY response is itself a result *table* whose rows
 * look like `(col_name, data_type, comment)` and describe the user query's
 * output schema. Those rows are what we read — NOT `table.schema.fields`,
 * which would describe DESCRIBE QUERY's own output (`col_name`, `data_type`,
 * `comment`) and yield bogus types for every query.
 */
function columnsFromArrowAttachment(
  attachment: string,
): Array<{ name: string; type_name: string; comment: string | undefined }> {
  const ipcBytes = Buffer.from(attachment, "base64");
  const describeTable = tableFromIPC(ipcBytes);
  const columns: Array<{
    name: string;
    type_name: string;
    comment: string | undefined;
  }> = [];
  for (const row of describeTable.toArray()) {
    const fields = row.toJSON() as {
      col_name?: unknown;
      data_type?: unknown;
      comment?: unknown;
    };
    // Non-string cells fall back to safe defaults rather than throwing.
    const name = typeof fields.col_name === "string" ? fields.col_name : "";
    const typeName =
      typeof fields.data_type === "string"
        ? fields.data_type.toUpperCase()
        : "STRING";
    const comment =
      typeof fields.comment === "string" && fields.comment !== ""
        ? fields.comment
        : undefined;
    columns.push({ name, type_name: typeName, comment });
  }
  return columns;
}

export function convertToQueryType(
result: DatabricksStatementExecutionResponse,
sql: string,
queryName: string,
): { type: string; hasResults: boolean } {
const dataRows = result.result?.data_array || [];
const columns = dataRows.map((row) => ({
let columns = dataRows.map((row) => ({
name: row[0] || "",
type_name: row[1]?.toUpperCase() || "STRING",
comment: row[2] || undefined,
}));

// Fallback: serverless warehouses return ARROW_STREAM format with an inline
// base64 attachment instead of data_array. Decode the Arrow IPC rows (the
// DESCRIBE QUERY result table) to extract column names and types.
if (columns.length === 0 && result.result?.attachment) {
logger.debug("data_array empty, decoding Arrow IPC attachment for schema");
try {
columns = columnsFromArrowAttachment(result.result.attachment);
} catch (err) {
logger.warn(
"Failed to decode Arrow IPC attachment: %s",
err instanceof Error ? err.message : String(err),
);
}
}

const paramsType = formatParametersType(sql);

// generate result fields with JSDoc
Expand Down Expand Up @@ -397,10 +449,11 @@ export async function generateQueriesFromDescribe(
);

logger.debug(
"DESCRIBE result for %s: state=%s, rows=%d",
"DESCRIBE result for %s: state=%s, rows=%d, hasAttachment=%s",
queryName,
result.status.state,
result.result?.data_array?.length ?? 0,
!!result.result?.attachment,
);

if (result.status.state === "FAILED") {
Expand Down
96 changes: 96 additions & 0 deletions packages/appkit/src/type-generator/tests/query-registry.test.ts
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import { Table, tableToIPC, vectorFromArray } from "apache-arrow";
import { describe, expect, test } from "vitest";
import {
convertToQueryType,
Expand All @@ -11,6 +12,20 @@ import {
} from "../query-registry";
import type { DatabricksStatementExecutionResponse } from "../types";

// Construct a base64 Arrow IPC stream mimicking a DESCRIBE QUERY response —
// a result *table* whose (col_name, data_type, comment) rows describe the
// user query's output schema.
function describeQueryAttachment(
  rows: Array<{ col_name: string; data_type: string; comment: string | null }>,
): string {
  const colNames: string[] = [];
  const dataTypes: string[] = [];
  const comments: string[] = [];
  for (const row of rows) {
    colNames.push(row.col_name);
    dataTypes.push(row.data_type);
    comments.push(row.comment ?? "");
  }
  const table = new Table({
    col_name: vectorFromArray(colNames),
    data_type: vectorFromArray(dataTypes),
    comment: vectorFromArray(comments),
  });
  return Buffer.from(tableToIPC(table, "stream")).toString("base64");
}

describe("normalizeTypeName", () => {
test("returns simple types unchanged", () => {
expect(normalizeTypeName("STRING")).toBe("STRING");
Expand Down Expand Up @@ -346,6 +361,87 @@ SELECT * FROM users WHERE date = :startDate AND count = :count AND name = :name`
);
expect(hasResults).toBe(false);
});

describe("ARROW_STREAM attachment fallback (serverless warehouses)", () => {
  test("decodes column metadata from Arrow IPC data rows, not schema fields", () => {
    // Critical regression test: reading `table.schema.fields` here would
    // generate { col_name: string; data_type: string; comment: string }
    // for every query — those are DESCRIBE QUERY's own output columns.
    // The data rows are what must be read.
    const response: DatabricksStatementExecutionResponse = {
      statement_id: "test-arrow",
      status: { state: "SUCCEEDED" },
      result: {
        attachment: describeQueryAttachment([
          { col_name: "user_id", data_type: "BIGINT", comment: null },
          { col_name: "name", data_type: "STRING", comment: "display name" },
          { col_name: "active", data_type: "BOOLEAN", comment: null },
        ]),
      },
    };

    const converted = convertToQueryType(
      response,
      "SELECT user_id, name, active FROM users",
      "users",
    );

    expect(converted.hasResults).toBe(true);
    // The real query columns appear in the generated type:
    expect(converted.type).toContain("user_id: number");
    expect(converted.type).toContain("name: string");
    expect(converted.type).toContain("active: boolean");
    // Column comments survive:
    expect(converted.type).toContain("/** display name");
    // DESCRIBE QUERY's metadata column names must NOT leak as user types:
    expect(converted.type).not.toContain("col_name: string");
    expect(converted.type).not.toContain("data_type: string");
  });

  test("normalizes lowercase data_type values to uppercase", () => {
    const response: DatabricksStatementExecutionResponse = {
      statement_id: "test-arrow",
      status: { state: "SUCCEEDED" },
      result: {
        attachment: describeQueryAttachment([
          { col_name: "id", data_type: "int", comment: null },
        ]),
      },
    };

    const converted = convertToQueryType(response, "SELECT 1", "test");
    expect(converted.type).toContain("@sqlType INT");
    expect(converted.type).toContain("id: number");
  });

  test("prefers data_array over attachment when both are present", () => {
    const response: DatabricksStatementExecutionResponse = {
      statement_id: "test-both",
      status: { state: "SUCCEEDED" },
      result: {
        data_array: [["from_data_array", "INT", null]],
        attachment: describeQueryAttachment([
          { col_name: "from_arrow", data_type: "STRING", comment: null },
        ]),
      },
    };

    const converted = convertToQueryType(response, "SELECT 1", "test");
    expect(converted.type).toContain("from_data_array: number");
    expect(converted.type).not.toContain("from_arrow");
  });

  test("logs a warning and yields no columns on malformed attachment", () => {
    const response: DatabricksStatementExecutionResponse = {
      statement_id: "test-bad",
      status: { state: "SUCCEEDED" },
      result: { attachment: "not-valid-arrow-ipc" },
    };

    const outcome = convertToQueryType(response, "SELECT 1", "test");
    expect(outcome.hasResults).toBe(false);
  });
});
});

describe("inferParameterTypes", () => {
Expand Down
2 changes: 2 additions & 0 deletions packages/appkit/src/type-generator/types.ts
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@ export interface DatabricksStatementExecutionResponse {
};
result?: {
data_array?: (string | null)[][];
/** Base64-encoded Arrow IPC bytes (returned by serverless warehouses using ARROW_STREAM format) */
attachment?: string;
};
}

Expand Down
6 changes: 5 additions & 1 deletion pnpm-lock.yaml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading