Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions packages/appkit/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,7 @@
"@opentelemetry/sdk-trace-base": "2.6.0",
"@opentelemetry/semantic-conventions": "1.38.0",
"@types/semver": "7.7.1",
"apache-arrow": "21.1.0",
"dotenv": "16.6.1",
"express": "4.22.0",
"obug": "2.1.1",
Expand Down
57 changes: 55 additions & 2 deletions packages/appkit/src/type-generator/query-registry.ts
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import fs from "node:fs/promises";
import path from "node:path";
import { WorkspaceClient } from "@databricks/sdk-experimental";
import { tableFromIPC } from "apache-arrow";
import pc from "picocolors";
import { createLogger } from "../logging/logger";
import { CACHE_VERSION, hashSQL, loadCache, saveCache } from "./cache";
Expand Down Expand Up @@ -129,18 +130,69 @@ function formatParametersType(sql: string): string {
: "Record<string, never>";
}

/**
* Decode a base64 Arrow IPC attachment from a DESCRIBE QUERY response and
* extract column metadata. Returns the same shape as rows parsed from the
* legacy data_array path.
*
* IMPORTANT: a DESCRIBE QUERY response is itself a result *table* with rows
* shaped like `(col_name, data_type, comment)` describing the user query's
* output schema. We must read those rows — NOT `table.schema.fields`, which
* would describe DESCRIBE QUERY's own output (`col_name`, `data_type`,
* `comment`) and yield bogus types for every query.
*/
/**
 * Decode a base64-encoded Arrow IPC attachment from a DESCRIBE QUERY
 * response and turn its data rows into column metadata, matching the shape
 * produced by the legacy data_array parsing path.
 *
 * IMPORTANT: a DESCRIBE QUERY response is itself a result *table* whose rows
 * look like `(col_name, data_type, comment)` and describe the user query's
 * output schema. Those rows are what we read — NOT `table.schema.fields`,
 * which would describe DESCRIBE QUERY's own output (`col_name`, `data_type`,
 * `comment`) and yield bogus types for every query.
 */
function columnsFromArrowAttachment(
  attachment: string,
): Array<{ name: string; type_name: string; comment: string | undefined }> {
  const ipcBytes = Buffer.from(attachment, "base64");
  const describeTable = tableFromIPC(ipcBytes);
  const columns: Array<{
    name: string;
    type_name: string;
    comment: string | undefined;
  }> = [];
  for (const row of describeTable.toArray()) {
    const fields = row.toJSON() as {
      col_name?: unknown;
      data_type?: unknown;
      comment?: unknown;
    };
    // Non-string cells fall back to safe defaults rather than throwing.
    const name = typeof fields.col_name === "string" ? fields.col_name : "";
    const typeName =
      typeof fields.data_type === "string"
        ? fields.data_type.toUpperCase()
        : "STRING";
    const comment =
      typeof fields.comment === "string" && fields.comment !== ""
        ? fields.comment
        : undefined;
    columns.push({ name, type_name: typeName, comment });
  }
  return columns;
}

export function convertToQueryType(
result: DatabricksStatementExecutionResponse,
sql: string,
queryName: string,
): { type: string; hasResults: boolean } {
const dataRows = result.result?.data_array || [];
const columns = dataRows.map((row) => ({
let columns = dataRows.map((row) => ({
name: row[0] || "",
type_name: row[1]?.toUpperCase() || "STRING",
comment: row[2] || undefined,
}));

// Fallback: serverless warehouses return ARROW_STREAM format with an inline
// base64 attachment instead of data_array. Decode the Arrow IPC rows (the
// DESCRIBE QUERY result table) to extract column names and types.
if (columns.length === 0 && result.result?.attachment) {
logger.debug("data_array empty, decoding Arrow IPC attachment for schema");
try {
columns = columnsFromArrowAttachment(result.result.attachment);
} catch (err) {
logger.warn(
"Failed to decode Arrow IPC attachment: %s",
err instanceof Error ? err.message : String(err),
);
}
}

const paramsType = formatParametersType(sql);

// generate result fields with JSDoc
Expand Down Expand Up @@ -397,10 +449,11 @@ export async function generateQueriesFromDescribe(
);

logger.debug(
"DESCRIBE result for %s: state=%s, rows=%d",
"DESCRIBE result for %s: state=%s, rows=%d, hasAttachment=%s",
queryName,
result.status.state,
result.result?.data_array?.length ?? 0,
!!result.result?.attachment,
);

if (result.status.state === "FAILED") {
Expand Down
96 changes: 96 additions & 0 deletions packages/appkit/src/type-generator/tests/query-registry.test.ts
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import { Table, tableToIPC, vectorFromArray } from "apache-arrow";
import { describe, expect, test } from "vitest";
import {
convertToQueryType,
Expand All @@ -11,6 +12,20 @@ import {
} from "../query-registry";
import type { DatabricksStatementExecutionResponse } from "../types";

// Construct a base64 Arrow IPC stream mimicking a DESCRIBE QUERY response —
// a result *table* whose (col_name, data_type, comment) rows describe the
// user query's output schema.
function describeQueryAttachment(
  rows: Array<{ col_name: string; data_type: string; comment: string | null }>,
): string {
  const colNames: string[] = [];
  const dataTypes: string[] = [];
  const comments: string[] = [];
  for (const row of rows) {
    colNames.push(row.col_name);
    dataTypes.push(row.data_type);
    comments.push(row.comment ?? "");
  }
  const table = new Table({
    col_name: vectorFromArray(colNames),
    data_type: vectorFromArray(dataTypes),
    comment: vectorFromArray(comments),
  });
  return Buffer.from(tableToIPC(table, "stream")).toString("base64");
}

describe("normalizeTypeName", () => {
test("returns simple types unchanged", () => {
expect(normalizeTypeName("STRING")).toBe("STRING");
Expand Down Expand Up @@ -346,6 +361,87 @@ SELECT * FROM users WHERE date = :startDate AND count = :count AND name = :name`
);
expect(hasResults).toBe(false);
});

describe("ARROW_STREAM attachment fallback (serverless warehouses)", () => {
  test("decodes column metadata from Arrow IPC data rows, not schema fields", () => {
    // Critical regression test: reading `table.schema.fields` here would
    // generate { col_name: string; data_type: string; comment: string }
    // for every query — those are DESCRIBE QUERY's own output columns.
    // The data rows are what must be read.
    const response: DatabricksStatementExecutionResponse = {
      statement_id: "test-arrow",
      status: { state: "SUCCEEDED" },
      result: {
        attachment: describeQueryAttachment([
          { col_name: "user_id", data_type: "BIGINT", comment: null },
          { col_name: "name", data_type: "STRING", comment: "display name" },
          { col_name: "active", data_type: "BOOLEAN", comment: null },
        ]),
      },
    };

    const converted = convertToQueryType(
      response,
      "SELECT user_id, name, active FROM users",
      "users",
    );

    expect(converted.hasResults).toBe(true);
    // The real query columns appear in the generated type:
    expect(converted.type).toContain("user_id: number");
    expect(converted.type).toContain("name: string");
    expect(converted.type).toContain("active: boolean");
    // Column comments survive:
    expect(converted.type).toContain("/** display name");
    // DESCRIBE QUERY's metadata column names must NOT leak as user types:
    expect(converted.type).not.toContain("col_name: string");
    expect(converted.type).not.toContain("data_type: string");
  });

  test("normalizes lowercase data_type values to uppercase", () => {
    const response: DatabricksStatementExecutionResponse = {
      statement_id: "test-arrow",
      status: { state: "SUCCEEDED" },
      result: {
        attachment: describeQueryAttachment([
          { col_name: "id", data_type: "int", comment: null },
        ]),
      },
    };

    const converted = convertToQueryType(response, "SELECT 1", "test");
    expect(converted.type).toContain("@sqlType INT");
    expect(converted.type).toContain("id: number");
  });

  test("prefers data_array over attachment when both are present", () => {
    const response: DatabricksStatementExecutionResponse = {
      statement_id: "test-both",
      status: { state: "SUCCEEDED" },
      result: {
        data_array: [["from_data_array", "INT", null]],
        attachment: describeQueryAttachment([
          { col_name: "from_arrow", data_type: "STRING", comment: null },
        ]),
      },
    };

    const converted = convertToQueryType(response, "SELECT 1", "test");
    expect(converted.type).toContain("from_data_array: number");
    expect(converted.type).not.toContain("from_arrow");
  });

  test("logs a warning and yields no columns on malformed attachment", () => {
    const response: DatabricksStatementExecutionResponse = {
      statement_id: "test-bad",
      status: { state: "SUCCEEDED" },
      result: { attachment: "not-valid-arrow-ipc" },
    };

    const outcome = convertToQueryType(response, "SELECT 1", "test");
    expect(outcome.hasResults).toBe(false);
  });
});
});

describe("inferParameterTypes", () => {
Expand Down
2 changes: 2 additions & 0 deletions packages/appkit/src/type-generator/types.ts
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@ export interface DatabricksStatementExecutionResponse {
};
result?: {
data_array?: (string | null)[][];
/** Base64-encoded Arrow IPC bytes (returned by serverless warehouses using ARROW_STREAM format) */
attachment?: string;
};
}

Expand Down
6 changes: 5 additions & 1 deletion pnpm-lock.yaml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading