diff --git a/docs/users/configuration/model-providers.md b/docs/users/configuration/model-providers.md index 8257dcdf0c..023f5d4e4a 100644 --- a/docs/users/configuration/model-providers.md +++ b/docs/users/configuration/model-providers.md @@ -64,6 +64,9 @@ This auth type supports not only OpenAI's official API but also any OpenAI-compa "maxRetries": 3, "enableCacheControl": true, "contextWindowSize": 128000, + "modalities": { + "image": true + }, "customHeaders": { "X-Client-Request-ID": "req-123" }, @@ -275,7 +278,7 @@ export VLLM_API_KEY="not-needed" ``` > [!note] -> +> > The `extra_body` parameter is **only supported for OpenAI-compatible providers** (`openai`, `qwen-oauth`). It is ignored for Anthropic, Gemini, and Vertex AI providers. ## Bailian Coding Plan @@ -388,7 +391,7 @@ The effective auth/model/credential values are chosen per field using the follow \*When present, CLI auth flags override settings. Otherwise, `security.auth.selectedType` or the implicit default determine the auth type. Qwen OAuth and OpenAI are the only auth types surfaced without extra configuration. > [!warning] -> +> > **Deprecation of `security.auth.apiKey` and `security.auth.baseUrl`:** Directly configuring API credentials via `security.auth.apiKey` and `security.auth.baseUrl` in `settings.json` is deprecated. These settings were used in historical versions for credentials entered through the UI, but the credential input flow was removed in version 0.10.1. These fields will be fully removed in a future release. **It is strongly recommended to migrate to `modelProviders`** for all model and credential configurations. Use `envKey` in `modelProviders` to reference environment variables for secure credential management instead of hardcoding credentials in settings files. 
## Generation Config Layering: The Impermeable Provider Layer @@ -522,7 +525,7 @@ The snapshot: ## Selection Persistence and Recommendations > [!important] -> +> > Define `modelProviders` in the user-scope `~/.qwen/settings.json` whenever possible and avoid persisting credential overrides in any scope. Keeping the provider catalog in user settings prevents merge/override conflicts between project and user scopes and ensures `/auth` and `/model` updates always write back to a consistent scope. - `/model` and `/auth` persist `model.name` (where applicable) and `security.auth.selectedType` to the closest writable scope that already defines `modelProviders`; otherwise they fall back to the user scope. This keeps workspace/user files in sync with the active provider catalog. diff --git a/docs/users/configuration/settings.md b/docs/users/configuration/settings.md index 82db2b3190..53f6a11c4c 100644 --- a/docs/users/configuration/settings.md +++ b/docs/users/configuration/settings.md @@ -125,18 +125,18 @@ Settings are organized into categories. All settings should be placed within the #### model -| Setting | Type | Description | Default | -| -------------------------------------------------- | ------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | ----------- | -| `model.name` | string | The Qwen model to use for conversations. | `undefined` | -| `model.maxSessionTurns` | number | Maximum number of user/model/tool turns to keep in a session. -1 means unlimited. 
| `-1` | -| `model.summarizeToolOutput` | object | Enables or disables the summarization of tool output. You can specify the token budget for the summarization using the `tokenBudget` setting. Note: Currently only the `run_shell_command` tool is supported. For example `{"run_shell_command": {"tokenBudget": 2000}}` | `undefined` | -| `model.generationConfig` | object | Advanced overrides passed to the underlying content generator. Supports request controls such as `timeout`, `maxRetries`, `enableCacheControl`, `contextWindowSize` (override model's context window size), `customHeaders` (custom HTTP headers for API requests), and `extra_body` (additional body parameters for OpenAI-compatible API requests only), along with fine-tuning knobs under `samplingParams` (for example `temperature`, `top_p`, `max_tokens`). Leave unset to rely on provider defaults. | `undefined` | -| `model.chatCompression.contextPercentageThreshold` | number | Sets the threshold for chat history compression as a percentage of the model's total token limit. This is a value between 0 and 1 that applies to both automatic compression and the manual `/compress` command. For example, a value of `0.6` will trigger compression when the chat history exceeds 60% of the token limit. Use `0` to disable compression entirely. | `0.7` | -| `model.skipNextSpeakerCheck` | boolean | Skip the next speaker check. | `false` | -| `model.skipLoopDetection` | boolean | Disables loop detection checks. Loop detection prevents infinite loops in AI responses but can generate false positives that interrupt legitimate workflows. Enable this option if you experience frequent false positive loop detection interruptions. | `false` | -| `model.skipStartupContext` | boolean | Skips sending the startup workspace context (environment summary and acknowledgement) at the beginning of each session. Enable this if you prefer to provide context manually or want to save tokens on startup. 
| `false` | -| `model.enableOpenAILogging` | boolean | Enables logging of OpenAI API calls for debugging and analysis. When enabled, API requests and responses are logged to JSON files. | `false` | -| `model.openAILoggingDir` | string | Custom directory path for OpenAI API logs. If not specified, defaults to `logs/openai` in the current working directory. Supports absolute paths, relative paths (resolved from current working directory), and `~` expansion (home directory). | `undefined` | +| Setting | Type | Description | Default | +| -------------------------------------------------- | ------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ----------- | +| `model.name` | string | The Qwen model to use for conversations. | `undefined` | +| `model.maxSessionTurns` | number | Maximum number of user/model/tool turns to keep in a session. -1 means unlimited. | `-1` | +| `model.summarizeToolOutput` | object | Enables or disables the summarization of tool output. You can specify the token budget for the summarization using the `tokenBudget` setting. Note: Currently only the `run_shell_command` tool is supported. For example `{"run_shell_command": {"tokenBudget": 2000}}` | `undefined` | +| `model.generationConfig` | object | Advanced overrides passed to the underlying content generator. 
Supports request controls such as `timeout`, `maxRetries`, `enableCacheControl`, `contextWindowSize` (override model's context window size), `modalities` (override auto-detected input modalities), `customHeaders` (custom HTTP headers for API requests), and `extra_body` (additional body parameters for OpenAI-compatible API requests only), along with fine-tuning knobs under `samplingParams` (for example `temperature`, `top_p`, `max_tokens`). Leave unset to rely on provider defaults. | `undefined` | +| `model.chatCompression.contextPercentageThreshold` | number | Sets the threshold for chat history compression as a percentage of the model's total token limit. This is a value between 0 and 1 that applies to both automatic compression and the manual `/compress` command. For example, a value of `0.6` will trigger compression when the chat history exceeds 60% of the token limit. Use `0` to disable compression entirely. | `0.7` | +| `model.skipNextSpeakerCheck` | boolean | Skip the next speaker check. | `false` | +| `model.skipLoopDetection` | boolean | Disables loop detection checks. Loop detection prevents infinite loops in AI responses but can generate false positives that interrupt legitimate workflows. Enable this option if you experience frequent false positive loop detection interruptions. | `false` | +| `model.skipStartupContext` | boolean | Skips sending the startup workspace context (environment summary and acknowledgement) at the beginning of each session. Enable this if you prefer to provide context manually or want to save tokens on startup. | `false` | +| `model.enableOpenAILogging` | boolean | Enables logging of OpenAI API calls for debugging and analysis. When enabled, API requests and responses are logged to JSON files. | `false` | +| `model.openAILoggingDir` | string | Custom directory path for OpenAI API logs. If not specified, defaults to `logs/openai` in the current working directory. 
Supports absolute paths, relative paths (resolved from current working directory), and `~` expansion (home directory). | `undefined` | **Example model.generationConfig:** @@ -146,6 +146,9 @@ Settings are organized into categories. All settings should be placed within the "generationConfig": { "timeout": 60000, "contextWindowSize": 128000, + "modalities": { + "image": true + }, "enableCacheControl": true, "customHeaders": { "X-Client-Request-ID": "req-123" @@ -167,6 +170,10 @@ Settings are organized into categories. All settings should be placed within the Overrides the default context window size for the selected model. Qwen Code determines the context window using built-in defaults based on model name matching, with a constant fallback value. Use this setting when a provider's effective context limit differs from Qwen Code's default. This value defines the model's assumed maximum context capacity, not a per-request token limit. +**modalities:** + +Overrides the auto-detected input modalities for the selected model. Qwen Code automatically detects supported modalities (image, PDF, audio, video) based on model name pattern matching. Use this setting when the auto-detection is incorrect — for example, to enable `pdf` for a model that supports it but isn't recognized. Format: `{ "image": true, "pdf": true, "audio": true, "video": true }`. Omit a key or set it to `false` for unsupported types. + **customHeaders:** Allows you to add custom HTTP headers to all API requests. This is useful for request tracing, monitoring, API gateway routing, or when different models require different headers. If `customHeaders` is defined in `modelProviders[].generationConfig.customHeaders`, it will be used directly; otherwise, headers from `model.generationConfig.customHeaders` will be used. No merging occurs between the two levels. 
diff --git a/packages/cli/src/i18n/locales/de.js b/packages/cli/src/i18n/locales/de.js index 8ae18e16e4..e7399b15ce 100644 --- a/packages/cli/src/i18n/locales/de.js +++ b/packages/cli/src/i18n/locales/de.js @@ -1034,6 +1034,17 @@ export default { '(default)': '(Standard)', '(set)': '(gesetzt)', '(not set)': '(nicht gesetzt)', + Modality: 'Modalität', + 'Context Window': 'Kontextfenster', + text: 'Text', + 'text-only': 'nur Text', + image: 'Bild', + pdf: 'PDF', + audio: 'Audio', + video: 'Video', + 'not set': 'nicht gesetzt', + none: 'keine', + unknown: 'unbekannt', "Failed to switch model to '{{modelId}}'.\n\n{{error}}": "Modell konnte nicht auf '{{modelId}}' umgestellt werden.\n\n{{error}}", 'Qwen 3.5 Plus — efficient hybrid model with leading coding performance': diff --git a/packages/cli/src/i18n/locales/en.js b/packages/cli/src/i18n/locales/en.js index 0d3d422a70..a73a3067f6 100644 --- a/packages/cli/src/i18n/locales/en.js +++ b/packages/cli/src/i18n/locales/en.js @@ -1021,6 +1021,17 @@ export default { '(default)': '(default)', '(set)': '(set)', '(not set)': '(not set)', + Modality: 'Modality', + 'Context Window': 'Context Window', + text: 'text', + 'text-only': 'text-only', + image: 'image', + pdf: 'pdf', + audio: 'audio', + video: 'video', + 'not set': 'not set', + none: 'none', + unknown: 'unknown', "Failed to switch model to '{{modelId}}'.\n\n{{error}}": "Failed to switch model to '{{modelId}}'.\n\n{{error}}", 'Qwen 3.5 Plus — efficient hybrid model with leading coding performance': diff --git a/packages/cli/src/i18n/locales/ja.js b/packages/cli/src/i18n/locales/ja.js index 9632d5675f..cca360cd7c 100644 --- a/packages/cli/src/i18n/locales/ja.js +++ b/packages/cli/src/i18n/locales/ja.js @@ -731,6 +731,17 @@ export default { // Dialogs - Model 'Select Model': 'モデルを選択', '(Press Esc to close)': '(Esc で閉じる)', + Modality: 'モダリティ', + 'Context Window': 'コンテキストウィンドウ', + text: 'テキスト', + 'text-only': 'テキストのみ', + image: '画像', + pdf: 'PDF', + audio: '音声', + video: 
'動画', + 'not set': '未設定', + none: 'なし', + unknown: '不明', 'Qwen 3.5 Plus — efficient hybrid model with leading coding performance': 'Qwen 3.5 Plus — 効率的なハイブリッドモデル、業界トップクラスのコーディング性能', 'The latest Qwen Vision model from Alibaba Cloud ModelStudio (version: qwen3-vl-plus-2025-09-23)': diff --git a/packages/cli/src/i18n/locales/pt.js b/packages/cli/src/i18n/locales/pt.js index d630879d1a..b58195b68c 100644 --- a/packages/cli/src/i18n/locales/pt.js +++ b/packages/cli/src/i18n/locales/pt.js @@ -1037,6 +1037,17 @@ export default { '(default)': '(padrão)', '(set)': '(definido)', '(not set)': '(não definido)', + Modality: 'Modalidade', + 'Context Window': 'Janela de Contexto', + text: 'texto', + 'text-only': 'somente texto', + image: 'imagem', + pdf: 'PDF', + audio: 'áudio', + video: 'vídeo', + 'not set': 'não definido', + none: 'nenhum', + unknown: 'desconhecido', "Failed to switch model to '{{modelId}}'.\n\n{{error}}": "Falha ao trocar o modelo para '{{modelId}}'.\n\n{{error}}", 'Qwen 3.5 Plus — efficient hybrid model with leading coding performance': diff --git a/packages/cli/src/i18n/locales/ru.js b/packages/cli/src/i18n/locales/ru.js index b8b332b769..90a021de70 100644 --- a/packages/cli/src/i18n/locales/ru.js +++ b/packages/cli/src/i18n/locales/ru.js @@ -1036,6 +1036,17 @@ export default { '(default)': '(по умолчанию)', '(set)': '(установлено)', '(not set)': '(не задано)', + Modality: 'Модальность', + 'Context Window': 'Контекстное окно', + text: 'текст', + 'text-only': 'только текст', + image: 'изображение', + pdf: 'PDF', + audio: 'аудио', + video: 'видео', + 'not set': 'не задано', + none: 'нет', + unknown: 'неизвестно', "Failed to switch model to '{{modelId}}'.\n\n{{error}}": "Не удалось переключиться на модель '{{modelId}}'.\n\n{{error}}", 'Qwen 3.5 Plus — efficient hybrid model with leading coding performance': diff --git a/packages/cli/src/i18n/locales/zh.js b/packages/cli/src/i18n/locales/zh.js index 02ae707b63..208bdc6766 100644 --- 
a/packages/cli/src/i18n/locales/zh.js +++ b/packages/cli/src/i18n/locales/zh.js @@ -961,6 +961,17 @@ export default { '(default)': '(默认)', '(set)': '(已设置)', '(not set)': '(未设置)', + Modality: '模态', + 'Context Window': '上下文窗口', + text: '文本', + 'text-only': '纯文本', + image: '图像', + pdf: 'PDF', + audio: '音频', + video: '视频', + 'not set': '未设置', + none: '无', + unknown: '未知', "Failed to switch model to '{{modelId}}'.\n\n{{error}}": "无法切换到模型 '{{modelId}}'.\n\n{{error}}", 'Qwen 3.5 Plus — efficient hybrid model with leading coding performance': diff --git a/packages/cli/src/ui/components/AppHeader.tsx b/packages/cli/src/ui/components/AppHeader.tsx index ba044d10de..0254a2012a 100644 --- a/packages/cli/src/ui/components/AppHeader.tsx +++ b/packages/cli/src/ui/components/AppHeader.tsx @@ -5,16 +5,43 @@ */ import { Box } from 'ink'; -import { Header } from './Header.js'; +import { AuthType } from '@qwen-code/qwen-code-core'; +import { Header, AuthDisplayType } from './Header.js'; import { Tips } from './Tips.js'; import { useSettings } from '../contexts/SettingsContext.js'; import { useConfig } from '../contexts/ConfigContext.js'; import { useUIState } from '../contexts/UIStateContext.js'; +import { isCodingPlanConfig } from '../../constants/codingPlan.js'; interface AppHeaderProps { version: string; } +/** + * Determine the auth display type based on auth type and configuration. 
+ */ +function getAuthDisplayType( + authType?: AuthType, + baseUrl?: string, + apiKeyEnvKey?: string, +): AuthDisplayType { + if (!authType) { + return AuthDisplayType.UNKNOWN; + } + + // Check if it's a Coding Plan config + if (isCodingPlanConfig(baseUrl, apiKeyEnvKey)) { + return AuthDisplayType.CODING_PLAN; + } + + switch (authType) { + case AuthType.QWEN_OAUTH: + return AuthDisplayType.QWEN_OAUTH; + default: + return AuthDisplayType.API_KEY; + } +} + export const AppHeader = ({ version }: AppHeaderProps) => { const settings = useSettings(); const config = useConfig(); @@ -27,12 +54,18 @@ export const AppHeader = ({ version }: AppHeaderProps) => { const showBanner = !config.getScreenReader(); const showTips = !(settings.merged.ui?.hideTips || config.getScreenReader()); + const authDisplayType = getAuthDisplayType( + authType, + contentGeneratorConfig?.baseUrl, + contentGeneratorConfig?.apiKeyEnvKey, + ); + return ( {showBanner && (
diff --git a/packages/cli/src/ui/components/Header.test.tsx b/packages/cli/src/ui/components/Header.test.tsx index 1d3a4d7f19..99bb053da6 100644 --- a/packages/cli/src/ui/components/Header.test.tsx +++ b/packages/cli/src/ui/components/Header.test.tsx @@ -6,8 +6,7 @@ import { render } from 'ink-testing-library'; import { describe, it, expect, vi, beforeEach } from 'vitest'; -import { AuthType } from '@qwen-code/qwen-code-core'; -import { Header } from './Header.js'; +import { Header, AuthDisplayType } from './Header.js'; import * as useTerminalSize from '../hooks/useTerminalSize.js'; vi.mock('../hooks/useTerminalSize.js'); @@ -15,86 +14,70 @@ const useTerminalSizeMock = vi.mocked(useTerminalSize.useTerminalSize); const defaultProps = { version: '1.0.0', - authType: AuthType.QWEN_OAUTH, + authDisplayType: AuthDisplayType.QWEN_OAUTH, model: 'qwen-coder-plus', workingDirectory: '/home/user/projects/test', }; describe('
', () => { beforeEach(() => { - // Default to wide terminal (shows both logo and info panel) useTerminalSizeMock.mockReturnValue({ columns: 120, rows: 24 }); }); it('renders the ASCII logo on wide terminal', () => { const { lastFrame } = render(
); - // Check that parts of the shortAsciiLogo are rendered expect(lastFrame()).toContain('██╔═══██╗'); }); it('hides the ASCII logo on narrow terminal', () => { useTerminalSizeMock.mockReturnValue({ columns: 60, rows: 24 }); const { lastFrame } = render(
); - // Should not contain the logo but still show the info panel expect(lastFrame()).not.toContain('██╔═══██╗'); expect(lastFrame()).toContain('>_ Qwen Code'); }); - it('renders custom ASCII art when provided on wide terminal', () => { - const customArt = 'CUSTOM ART'; - const { lastFrame } = render( -
, - ); - expect(lastFrame()).toContain(customArt); - }); - it('displays the version number', () => { const { lastFrame } = render(
); expect(lastFrame()).toContain('v1.0.0'); }); - it('displays Qwen Code title with >_ prefix', () => { - const { lastFrame } = render(
); - expect(lastFrame()).toContain('>_ Qwen Code'); - }); - it('displays auth type and model', () => { const { lastFrame } = render(
); expect(lastFrame()).toContain('Qwen OAuth'); expect(lastFrame()).toContain('qwen-coder-plus'); }); - it('displays working directory', () => { - const { lastFrame } = render(
); - expect(lastFrame()).toContain('/home/user/projects/test'); - }); - - it('renders a custom working directory display', () => { + it('displays Coding Plan auth type', () => { const { lastFrame } = render( -
, +
, ); - expect(lastFrame()).toContain('custom display'); + expect(lastFrame()).toContain('Coding Plan'); }); - it('displays working directory without branch name', () => { - const { lastFrame } = render(
); - // Branch name is no longer shown in header - expect(lastFrame()).toContain('/home/user/projects/test'); - expect(lastFrame()).not.toContain('(main*)'); + it('displays API Key auth type', () => { + const { lastFrame } = render( +
, + ); + expect(lastFrame()).toContain('API Key'); }); - it('formats home directory with tilde', () => { + it('displays Unknown when auth type is not set', () => { const { lastFrame } = render( -
, +
, ); - // The actual home dir replacement depends on os.homedir() - // Just verify the path is shown - expect(lastFrame()).toContain('projects'); + expect(lastFrame()).toContain('Unknown'); + }); + + it('displays working directory', () => { + const { lastFrame } = render(
); + expect(lastFrame()).toContain('/home/user/projects/test'); }); it('renders with border around info panel', () => { const { lastFrame } = render(
); - // Check for border characters (round border style uses these) expect(lastFrame()).toContain('╭'); expect(lastFrame()).toContain('╯'); }); diff --git a/packages/cli/src/ui/components/Header.tsx b/packages/cli/src/ui/components/Header.tsx index adbe130714..45fce43850 100644 --- a/packages/cli/src/ui/components/Header.tsx +++ b/packages/cli/src/ui/components/Header.tsx @@ -7,59 +7,35 @@ import type React from 'react'; import { Box, Text } from 'ink'; import Gradient from 'ink-gradient'; -import { AuthType, shortenPath, tildeifyPath } from '@qwen-code/qwen-code-core'; +import { shortenPath, tildeifyPath } from '@qwen-code/qwen-code-core'; import { theme } from '../semantic-colors.js'; import { shortAsciiLogo } from './AsciiArt.js'; import { getAsciiArtWidth, getCachedStringWidth } from '../utils/textUtils.js'; import { useTerminalSize } from '../hooks/useTerminalSize.js'; +/** + * Auth display type for the Header component. + * Simplified representation of authentication method shown to users. 
+ */ +export enum AuthDisplayType { + QWEN_OAUTH = 'Qwen OAuth', + CODING_PLAN = 'Coding Plan', + API_KEY = 'API Key', + UNKNOWN = 'Unknown', +} + interface HeaderProps { customAsciiArt?: string; // For user-defined ASCII art version: string; - authType?: AuthType; + authDisplayType?: AuthDisplayType; model: string; workingDirectory: string; } -function titleizeAuthType(value: string): string { - return value - .split(/[-_]/g) - .filter(Boolean) - .map((part) => { - if (part.toLowerCase() === 'ai') { - return 'AI'; - } - return part.charAt(0).toUpperCase() + part.slice(1); - }) - .join(' '); -} - -// Format auth type for display -function formatAuthType(authType?: AuthType): string { - if (!authType) { - return 'Unknown'; - } - - switch (authType) { - case AuthType.QWEN_OAUTH: - return 'Qwen OAuth'; - case AuthType.USE_OPENAI: - return 'OpenAI'; - case AuthType.USE_GEMINI: - return 'Gemini'; - case AuthType.USE_VERTEX_AI: - return 'Vertex AI'; - case AuthType.USE_ANTHROPIC: - return 'Anthropic'; - default: - return titleizeAuthType(String(authType)); - } -} - export const Header: React.FC = ({ customAsciiArt, version, - authType, + authDisplayType, model, workingDirectory, }) => { @@ -67,7 +43,7 @@ export const Header: React.FC = ({ const displayLogo = customAsciiArt ?? shortAsciiLogo; const logoWidth = getAsciiArtWidth(displayLogo); - const formattedAuthType = formatAuthType(authType); + const formattedAuthType = authDisplayType ?? AuthDisplayType.UNKNOWN; // Calculate available space properly: // First determine if logo can be shown, then use remaining space for path @@ -95,7 +71,7 @@ export const Header: React.FC = ({ ? 
Math.min(availableTerminalWidth - logoWidth - logoGap, maxInfoPanelWidth) : availableTerminalWidth; - // Calculate max path length (subtract padding/borders from available space) + // Calculate max path lengths (subtract padding/borders from available space) const maxPathLength = Math.max( 0, availableInfoPanelWidth - infoPanelChromeWidth, diff --git a/packages/cli/src/ui/components/ModelDialog.test.tsx b/packages/cli/src/ui/components/ModelDialog.test.tsx index 7e05bdc43e..dc5cc108a3 100644 --- a/packages/cli/src/ui/components/ModelDialog.test.tsx +++ b/packages/cli/src/ui/components/ModelDialog.test.tsx @@ -114,10 +114,9 @@ describe('', () => { cleanup(); }); - it('renders the title and help text', () => { + it('renders the title', () => { const { getByText } = renderComponent(); expect(getByText('Select Model')).toBeDefined(); - expect(getByText('(Press Esc to close)')).toBeDefined(); }); it('passes all model options to DescriptiveRadioButtonSelect', () => { @@ -289,11 +288,12 @@ describe('', () => { expect(props.onClose).toHaveBeenCalledTimes(1); }); - it('does not pass onHighlight to DescriptiveRadioButtonSelect', () => { + it('passes onHighlight to DescriptiveRadioButtonSelect', () => { renderComponent(); const childOnHighlight = mockedSelect.mock.calls[0][0].onHighlight; - expect(childOnHighlight).toBeUndefined(); + expect(childOnHighlight).toBeDefined(); + expect(typeof childOnHighlight).toBe('function'); }); it('calls onClose prop when "escape" key is pressed', () => { diff --git a/packages/cli/src/ui/components/ModelDialog.tsx b/packages/cli/src/ui/components/ModelDialog.tsx index 8fdbbe38d2..09723dcddf 100644 --- a/packages/cli/src/ui/components/ModelDialog.tsx +++ b/packages/cli/src/ui/components/ModelDialog.tsx @@ -14,8 +14,7 @@ import { MAINLINE_CODER_MODEL, type AvailableModel as CoreAvailableModel, type ContentGeneratorConfig, - type ContentGeneratorConfigSource, - type ContentGeneratorConfigSources, + type InputModalities, } from 
'@qwen-code/qwen-code-core'; import { useKeypress } from '../hooks/useKeypress.js'; import { theme } from '../semantic-colors.js'; @@ -26,61 +25,25 @@ import { useSettings } from '../contexts/SettingsContext.js'; import { getPersistScopeForModelSelection } from '../../config/modelProvidersScope.js'; import { t } from '../../i18n/index.js'; -interface ModelDialogProps { - onClose: () => void; +function formatModalities(modalities?: InputModalities): string { + if (!modalities) return t('text-only'); + const parts: string[] = []; + if (modalities.image) parts.push(t('image')); + if (modalities.pdf) parts.push(t('pdf')); + if (modalities.audio) parts.push(t('audio')); + if (modalities.video) parts.push(t('video')); + if (parts.length === 0) return t('text-only'); + return `${t('text')} · ${parts.join(' · ')}`; } -function formatSourceBadge( - source: ContentGeneratorConfigSource | undefined, -): string | undefined { - if (!source) return undefined; - - switch (source.kind) { - case 'cli': - return source.detail ? `CLI ${source.detail}` : 'CLI'; - case 'env': - return source.envKey ? `ENV ${source.envKey}` : 'ENV'; - case 'settings': - return source.settingsPath - ? `Settings ${source.settingsPath}` - : 'Settings'; - case 'modelProviders': { - const suffix = - source.authType && source.modelId - ? `${source.authType}:${source.modelId}` - : source.authType - ? `${source.authType}` - : source.modelId - ? `${source.modelId}` - : ''; - return suffix ? `ModelProviders ${suffix}` : 'ModelProviders'; - } - case 'default': - return source.detail ? `Default ${source.detail}` : 'Default'; - case 'computed': - return source.detail ? `Computed ${source.detail}` : 'Computed'; - case 'programmatic': - return source.detail ? 
`Programmatic ${source.detail}` : 'Programmatic'; - case 'unknown': - default: - return undefined; - } -} - -function readSourcesFromConfig(config: unknown): ContentGeneratorConfigSources { - if (!config) { - return {}; - } - const maybe = config as { - getContentGeneratorConfigSources?: () => ContentGeneratorConfigSources; - }; - return maybe.getContentGeneratorConfigSources?.() ?? {}; +interface ModelDialogProps { + onClose: () => void; } function maskApiKey(apiKey: string | undefined): string { - if (!apiKey) return '(not set)'; + if (!apiKey) return `(${t('not set')})`; const trimmed = apiKey.trim(); - if (trimmed.length === 0) return '(not set)'; + if (trimmed.length === 0) return `(${t('not set')})`; if (trimmed.length <= 6) return '***'; const head = trimmed.slice(0, 3); const tail = trimmed.slice(-4); @@ -131,7 +94,7 @@ function handleModelSwitchSuccess({ { type: 'info', text: - `authType: ${effectiveAuthType ?? '(none)'}` + + `authType: ${effectiveAuthType ?? `(${t('none')})`}` + `\n` + `Using ${isRuntime ? 'runtime ' : ''}model: ${effectiveModelId}` + `\n` + @@ -143,35 +106,26 @@ function handleModelSwitchSuccess({ ); } -function ConfigRow({ +function formatContextWindow(size?: number): string { + if (!size) return `(${t('unknown')})`; + return `${size.toLocaleString('en-US')} tokens`; +} + +function DetailRow({ label, value, - badge, }: { label: string; value: React.ReactNode; - badge?: string; }): React.JSX.Element { return ( - - - - {label}: - - - {value} - + + + {label}: + + + {value} - {badge ? 
( - - - - - - {badge} - - - ) : null} ); } @@ -183,13 +137,9 @@ export function ModelDialog({ onClose }: ModelDialogProps): React.JSX.Element { // Local error state for displaying errors within the dialog const [errorMessage, setErrorMessage] = useState(null); + const [highlightedValue, setHighlightedValue] = useState(null); const authType = config?.getAuthType(); - const effectiveConfig = - (config?.getContentGeneratorConfig?.() as - | ContentGeneratorConfig - | undefined) ?? undefined; - const sources = readSourcesFromConfig(config); const availableModelEntries = useMemo(() => { const allModels = config ? config.getAllConfiguredModels() : []; @@ -319,6 +269,20 @@ export function ModelDialog({ onClose }: ModelDialogProps): React.JSX.Element { return index === -1 ? 0 : index; }, [MODEL_OPTIONS, preferredKey]); + const handleHighlight = useCallback((value: string) => { + setHighlightedValue(value); + }, []); + + const highlightedEntry = useMemo(() => { + const key = highlightedValue ?? preferredKey; + return availableModelEntries.find( + ({ authType: t2, model, isRuntime, snapshotId }) => { + const v = isRuntime && snapshotId ? snapshotId : `${t2}::${model.id}`; + return v === key; + }, + ); + }, [highlightedValue, preferredKey, availableModelEntries]); + const handleSelect = useCallback( async (selected: string) => { setErrorMessage(null); @@ -413,35 +377,6 @@ export function ModelDialog({ onClose }: ModelDialogProps): React.JSX.Element { > {t('Select Model')} - - - {t('Current (effective) configuration')} - - - - - - {authType !== AuthType.QWEN_OAUTH && ( - <> - - - - )} - - - {!hasModels ? 
( @@ -465,12 +400,48 @@ export function ModelDialog({ onClose }: ModelDialogProps): React.JSX.Element { )} + {highlightedEntry && ( + + + + + {highlightedEntry.authType !== AuthType.QWEN_OAUTH && ( + <> + + + + )} + + )} + {errorMessage && ( @@ -480,7 +451,9 @@ export function ModelDialog({ onClose }: ModelDialogProps): React.JSX.Element { )} - {t('(Press Esc to close)')} + + {t('Enter to select, ↑↓ to navigate, Esc to close')} + ); diff --git a/packages/core/src/core/contentGenerator.ts b/packages/core/src/core/contentGenerator.ts index f3af06bda2..078729af6f 100644 --- a/packages/core/src/core/contentGenerator.ts +++ b/packages/core/src/core/contentGenerator.ts @@ -60,6 +60,17 @@ export enum AuthType { USE_ANTHROPIC = 'anthropic', } +/** + * Supported input modalities for a model. + * Omitted or false fields mean the model does not support that input type. + */ +export type InputModalities = { + image?: boolean; + pdf?: boolean; + audio?: boolean; + video?: boolean; +}; + export type ContentGeneratorConfig = { model: string; apiKey?: string; @@ -98,6 +109,9 @@ export type ContentGeneratorConfig = { customHeaders?: Record; // Extra body parameters to be merged into the request body extra_body?: Record; + // Supported input modalities. Unsupported media types are replaced with text + // placeholders. Leave undefined to use automatic detection from model name. 
+ modalities?: InputModalities; }; // Keep the public ContentGeneratorConfigSources API, but reuse the generic diff --git a/packages/core/src/core/modalityDefaults.test.ts b/packages/core/src/core/modalityDefaults.test.ts new file mode 100644 index 0000000000..b90bc069e4 --- /dev/null +++ b/packages/core/src/core/modalityDefaults.test.ts @@ -0,0 +1,213 @@ +/** + * @license + * Copyright 2025 Qwen Team + * SPDX-License-Identifier: Apache-2.0 + */ + +import { describe, it, expect } from 'vitest'; +import { defaultModalities } from './modalityDefaults.js'; + +describe('defaultModalities', () => { + describe('Google Gemini', () => { + it('returns full multimodal for gemini-3-pro', () => { + expect(defaultModalities('gemini-3-pro-preview')).toEqual({ + image: true, + pdf: true, + audio: true, + video: true, + }); + }); + + it('returns full multimodal for gemini-3-flash', () => { + expect(defaultModalities('gemini-3-flash-preview')).toEqual({ + image: true, + pdf: true, + audio: true, + video: true, + }); + }); + + it('returns full multimodal for gemini-3.1-pro', () => { + expect(defaultModalities('gemini-3.1-pro-preview')).toEqual({ + image: true, + pdf: true, + audio: true, + video: true, + }); + }); + + it('returns full multimodal for gemini-2.5-pro', () => { + expect(defaultModalities('gemini-2.5-pro')).toEqual({ + image: true, + pdf: true, + audio: true, + video: true, + }); + }); + + it('returns full multimodal for gemini-1.5-flash', () => { + expect(defaultModalities('gemini-1.5-flash')).toEqual({ + image: true, + pdf: true, + audio: true, + video: true, + }); + }); + }); + + describe('OpenAI', () => { + it('returns image for gpt-5.2', () => { + const m = defaultModalities('gpt-5.2'); + expect(m.image).toBe(true); + expect(m.audio).toBeUndefined(); + expect(m.pdf).toBeUndefined(); + expect(m.video).toBeUndefined(); + }); + + it('returns image for gpt-5-mini', () => { + expect(defaultModalities('gpt-5-mini').image).toBe(true); + }); + + it('returns image for 
gpt-4o', () => { + expect(defaultModalities('gpt-4o').image).toBe(true); + }); + + it('returns image for o3', () => { + expect(defaultModalities('o3').image).toBe(true); + }); + }); + + describe('Anthropic Claude', () => { + it('returns image + pdf for claude-opus-4-6', () => { + const m = defaultModalities('claude-opus-4-6'); + expect(m.image).toBe(true); + expect(m.pdf).toBe(true); + expect(m.audio).toBeUndefined(); + expect(m.video).toBeUndefined(); + }); + + it('returns image + pdf for claude-sonnet-4-6', () => { + const m = defaultModalities('claude-sonnet-4-6'); + expect(m.image).toBe(true); + expect(m.pdf).toBe(true); + }); + + it('returns image + pdf for claude-sonnet-4', () => { + const m = defaultModalities('claude-sonnet-4'); + expect(m.image).toBe(true); + expect(m.pdf).toBe(true); + }); + + it('returns image + pdf for claude-3.5-sonnet', () => { + const m = defaultModalities('claude-3.5-sonnet'); + expect(m.image).toBe(true); + expect(m.pdf).toBe(true); + }); + }); + + describe('Qwen', () => { + it('returns image + video for qwen-vl-max', () => { + const m = defaultModalities('qwen-vl-max'); + expect(m.image).toBe(true); + expect(m.video).toBe(true); + expect(m.pdf).toBeUndefined(); + expect(m.audio).toBeUndefined(); + }); + + it('returns image + video for qwen3-vl-plus', () => { + const m = defaultModalities('qwen3-vl-plus'); + expect(m.image).toBe(true); + expect(m.video).toBe(true); + }); + + it('returns text-only for qwen3-coder-plus', () => { + expect(defaultModalities('qwen3-coder-plus')).toEqual({}); + }); + + it('returns image + video for coder-model (same as qwen3.5-plus)', () => { + expect(defaultModalities('coder-model')).toEqual({ + image: true, + video: true, + }); + }); + + it('returns image + video for qwen3.5-plus', () => { + const m = defaultModalities('qwen3.5-plus'); + expect(m.image).toBe(true); + expect(m.video).toBe(true); + expect(m.pdf).toBeUndefined(); + expect(m.audio).toBeUndefined(); + }); + + it('returns text-only for 
qwen-turbo', () => { + expect(defaultModalities('qwen-turbo')).toEqual({}); + }); + }); + + describe('DeepSeek', () => { + it('returns text-only for deepseek-chat', () => { + expect(defaultModalities('deepseek-chat')).toEqual({}); + }); + + it('returns text-only for deepseek-reasoner', () => { + expect(defaultModalities('deepseek-reasoner')).toEqual({}); + }); + }); + + describe('Zhipu GLM', () => { + it('returns image for glm-4.5v', () => { + const m = defaultModalities('glm-4.5v'); + expect(m.image).toBe(true); + expect(m.pdf).toBeUndefined(); + }); + + it('returns text-only for glm-5', () => { + expect(defaultModalities('glm-5')).toEqual({}); + }); + + it('returns text-only for glm-4.7', () => { + expect(defaultModalities('glm-4.7')).toEqual({}); + }); + }); + + describe('MiniMax', () => { + it('returns text-only for MiniMax-M2.5', () => { + expect(defaultModalities('MiniMax-M2.5')).toEqual({}); + }); + }); + + describe('Kimi', () => { + it('returns image + video for kimi-k2.5', () => { + const m = defaultModalities('kimi-k2.5'); + expect(m.image).toBe(true); + expect(m.video).toBe(true); + expect(m.pdf).toBeUndefined(); + expect(m.audio).toBeUndefined(); + }); + + it('returns text-only for kimi-k2', () => { + expect(defaultModalities('kimi-k2')).toEqual({}); + }); + }); + + describe('unknown models', () => { + it('returns text-only for unrecognized models', () => { + expect(defaultModalities('some-random-model-xyz')).toEqual({}); + }); + }); + + describe('normalization', () => { + it('normalizes provider prefixes', () => { + expect(defaultModalities('openai/gpt-4o')).toEqual( + defaultModalities('gpt-4o'), + ); + }); + + it('returns a fresh copy each time', () => { + const a = defaultModalities('gemini-2.5-pro'); + const b = defaultModalities('gemini-2.5-pro'); + expect(a).toEqual(b); + expect(a).not.toBe(b); + }); + }); +}); diff --git a/packages/core/src/core/modalityDefaults.ts b/packages/core/src/core/modalityDefaults.ts new file mode 100644 index 
0000000000..f17927325e --- /dev/null +++ b/packages/core/src/core/modalityDefaults.ts @@ -0,0 +1,94 @@ +/** + * @license + * Copyright 2025 Qwen Team + * SPDX-License-Identifier: Apache-2.0 + */ + +import type { InputModalities } from './contentGenerator.js'; +import { normalize } from './tokenLimits.js'; + +const FULL_MULTIMODAL: InputModalities = { + image: true, + pdf: true, + audio: true, + video: true, +}; + +/** + * Ordered regex patterns: most specific -> most general (first match wins). + * Default for unknown models is text-only (empty object = all false). + */ +const MODALITY_PATTERNS: Array<[RegExp, InputModalities]> = [ + // ------------------- + // Google Gemini — full multimodal + // ------------------- + [/^gemini-3/, FULL_MULTIMODAL], + [/^gemini-/, FULL_MULTIMODAL], + + // ------------------- + // OpenAI — image by default for all gpt/o-series models + // ------------------- + [/^gpt-5/, { image: true }], + [/^gpt-/, { image: true }], + [/^o\d/, { image: true }], + + // ------------------- + // Anthropic Claude — image + pdf + // ------------------- + [/^claude-/, { image: true, pdf: true }], + + // ------------------- + // Alibaba / Qwen + // ------------------- + // Qwen3.5-Plus: image support + [/^qwen3\.5-plus/, { image: true, video: true }], + [/^coder-model$/, { image: true, video: true }], + + // Qwen VL (vision-language) models: image + video + [/^qwen-vl-/, { image: true, video: true }], + [/^qwen3-vl-/, { image: true, video: true }], + + // Qwen coder / text models: text-only + [/^qwen3-coder-/, {}], + [/^qwen/, {}], + + // ------------------- + // DeepSeek — text-only + // ------------------- + [/^deepseek/, {}], + + // ------------------- + // Zhipu GLM + // ------------------- + [/^glm-4\.5v/, { image: true }], + [/^glm-5(?:-|$)/, {}], + [/^glm-/, {}], + + // ------------------- + // MiniMax — text-only + // ------------------- + [/^minimax-/, {}], + + // ------------------- + // Moonshot / Kimi + // ------------------- + 
[/^kimi-k2\.5/, { image: true, video: true }], + [/^kimi-/, {}], +]; + +/** + * Return the default input modalities for a model based on its name. + * + * Uses the same normalize-then-regex pattern as {@link tokenLimit}. + * Unknown models default to text-only (empty object) to avoid sending + * unsupported media types that would cause unrecoverable API errors. + */ +export function defaultModalities(model: string): InputModalities { + const norm = normalize(model); + for (const [regex, modalities] of MODALITY_PATTERNS) { + if (regex.test(norm)) { + return { ...modalities }; + } + } + return {}; +} diff --git a/packages/core/src/core/openaiContentGenerator/converter.test.ts b/packages/core/src/core/openaiContentGenerator/converter.test.ts index 36bbc812de..edad4992c4 100644 --- a/packages/core/src/core/openaiContentGenerator/converter.test.ts +++ b/packages/core/src/core/openaiContentGenerator/converter.test.ts @@ -22,7 +22,12 @@ describe('OpenAIContentConverter', () => { let converter: OpenAIContentConverter; beforeEach(() => { - converter = new OpenAIContentConverter('test-model'); + converter = new OpenAIContentConverter('test-model', 'auto', { + image: true, + pdf: true, + audio: true, + video: true, + }); }); describe('resetStreamingToolCalls', () => { @@ -1684,7 +1689,12 @@ describe('MCP tool result end-to-end through OpenAI converter (issue #1520)', () let converter: OpenAIContentConverter; beforeEach(() => { - converter = new OpenAIContentConverter('test-model'); + converter = new OpenAIContentConverter('test-model', 'auto', { + image: true, + pdf: true, + audio: true, + video: true, + }); }); it('should preserve MCP multi-text content in tool message (not leak to user message)', () => { @@ -1957,3 +1967,159 @@ describe('MCP tool result end-to-end through OpenAI converter (issue #1520)', () expect(contentArray[1].image_url?.url).toContain('data:image/png'); }); }); + +describe('modality filtering', () => { + function makeRequest(parts: Part[]): 
GenerateContentParameters { + return { + model: 'test-model', + contents: [{ role: 'user', parts }], + }; + } + + function getUserContentParts( + messages: OpenAI.Chat.ChatCompletionMessageParam[], + ): Array<{ type: string; text?: string }> { + const userMsg = messages.find((m) => m.role === 'user'); + if ( + !userMsg || + !('content' in userMsg) || + !Array.isArray(userMsg.content) + ) { + return []; + } + return userMsg.content as Array<{ type: string; text?: string }>; + } + + it('replaces image with placeholder when image modality is disabled', () => { + const conv = new OpenAIContentConverter('deepseek-chat', 'auto', {}); + const request = makeRequest([ + { + inlineData: { mimeType: 'image/png', data: 'abc123' }, + displayName: 'screenshot.png', + } as unknown as Part, + ]); + const messages = conv.convertGeminiRequestToOpenAI(request); + const parts = getUserContentParts(messages); + expect(parts).toHaveLength(1); + expect(parts[0].type).toBe('text'); + expect(parts[0].text).toContain('image file'); + expect(parts[0].text).toContain('does not support image input'); + }); + + it('keeps image when image modality is enabled', () => { + const conv = new OpenAIContentConverter('gpt-4o', 'auto', { image: true }); + const request = makeRequest([ + { + inlineData: { mimeType: 'image/png', data: 'abc123' }, + } as unknown as Part, + ]); + const messages = conv.convertGeminiRequestToOpenAI(request); + const parts = getUserContentParts(messages); + expect(parts).toHaveLength(1); + expect(parts[0].type).toBe('image_url'); + }); + + it('replaces PDF with placeholder when pdf modality is disabled', () => { + const conv = new OpenAIContentConverter('test-model', 'auto', { + image: true, + }); + const request = makeRequest([ + { + inlineData: { + mimeType: 'application/pdf', + data: 'pdf-data', + displayName: 'doc.pdf', + }, + } as unknown as Part, + ]); + const messages = conv.convertGeminiRequestToOpenAI(request); + const parts = getUserContentParts(messages); + 
expect(parts).toHaveLength(1); + expect(parts[0].type).toBe('text'); + expect(parts[0].text).toContain('pdf file'); + expect(parts[0].text).toContain('does not support PDF input'); + }); + + it('keeps PDF when pdf modality is enabled', () => { + const conv = new OpenAIContentConverter('claude-sonnet', 'auto', { + image: true, + pdf: true, + }); + const request = makeRequest([ + { + inlineData: { + mimeType: 'application/pdf', + data: 'pdf-data', + displayName: 'doc.pdf', + }, + } as unknown as Part, + ]); + const messages = conv.convertGeminiRequestToOpenAI(request); + const parts = getUserContentParts(messages); + expect(parts).toHaveLength(1); + expect(parts[0].type).toBe('file'); + }); + + it('replaces video with placeholder when video modality is disabled', () => { + const conv = new OpenAIContentConverter('test-model', 'auto', {}); + const request = makeRequest([ + { + inlineData: { mimeType: 'video/mp4', data: 'vid-data' }, + } as unknown as Part, + ]); + const messages = conv.convertGeminiRequestToOpenAI(request); + const parts = getUserContentParts(messages); + expect(parts).toHaveLength(1); + expect(parts[0].type).toBe('text'); + expect(parts[0].text).toContain('video file'); + }); + + it('replaces audio with placeholder when audio modality is disabled', () => { + const conv = new OpenAIContentConverter('test-model', 'auto', {}); + const request = makeRequest([ + { + inlineData: { mimeType: 'audio/wav', data: 'audio-data' }, + } as unknown as Part, + ]); + const messages = conv.convertGeminiRequestToOpenAI(request); + const parts = getUserContentParts(messages); + expect(parts).toHaveLength(1); + expect(parts[0].type).toBe('text'); + expect(parts[0].text).toContain('audio file'); + }); + + it('handles mixed content: keeps text + supported media, replaces unsupported', () => { + const conv = new OpenAIContentConverter('gpt-4o', 'auto', { image: true }); + const request = makeRequest([ + { text: 'Analyze these files' }, + { + inlineData: { mimeType: 
'image/png', data: 'img-data' }, + } as unknown as Part, + { + inlineData: { mimeType: 'video/mp4', data: 'vid-data' }, + } as unknown as Part, + ]); + const messages = conv.convertGeminiRequestToOpenAI(request); + const parts = getUserContentParts(messages); + expect(parts).toHaveLength(3); + expect(parts[0].type).toBe('text'); + expect(parts[0].text).toBe('Analyze these files'); + expect(parts[1].type).toBe('image_url'); + expect(parts[2].type).toBe('text'); + expect(parts[2].text).toContain('video file'); + }); + + it('defaults to text-only when no modalities are specified', () => { + const conv = new OpenAIContentConverter('unknown-model'); + const request = makeRequest([ + { + inlineData: { mimeType: 'image/png', data: 'img-data' }, + } as unknown as Part, + ]); + const messages = conv.convertGeminiRequestToOpenAI(request); + const parts = getUserContentParts(messages); + expect(parts).toHaveLength(1); + expect(parts[0].type).toBe('text'); + expect(parts[0].text).toContain('image file'); + }); +}); diff --git a/packages/core/src/core/openaiContentGenerator/converter.ts b/packages/core/src/core/openaiContentGenerator/converter.ts index 2ca7428bdd..bdfc0286ed 100644 --- a/packages/core/src/core/openaiContentGenerator/converter.ts +++ b/packages/core/src/core/openaiContentGenerator/converter.ts @@ -20,12 +20,16 @@ import type { import { GenerateContentResponse, FinishReason } from '@google/genai'; import type OpenAI from 'openai'; import { safeJsonParse } from '../../utils/safeJsonParse.js'; +import { createDebugLogger } from '../../utils/debugLogger.js'; +import type { InputModalities } from '../contentGenerator.js'; import { StreamingToolCallParser } from './streamingToolCallParser.js'; import { convertSchema, type SchemaComplianceMode, } from '../../utils/schemaConverter.js'; +const debugLogger = createDebugLogger('CONVERTER'); + /** * Extended usage type that supports both OpenAI standard format and alternative formats * Some models return cached_tokens at 
the top level instead of in prompt_tokens_details @@ -92,12 +96,18 @@ type OpenAIContentPart = export class OpenAIContentConverter { private model: string; private schemaCompliance: SchemaComplianceMode; + private modalities: InputModalities; private streamingToolCallParser: StreamingToolCallParser = new StreamingToolCallParser(); - constructor(model: string, schemaCompliance: SchemaComplianceMode = 'auto') { + constructor( + model: string, + schemaCompliance: SchemaComplianceMode = 'auto', + modalities: InputModalities = {}, + ) { this.model = model; this.schemaCompliance = schemaCompliance; + this.modalities = modalities; } /** @@ -108,6 +118,13 @@ export class OpenAIContentConverter { this.model = model; } + /** + * Update the supported input modalities. + */ + setModalities(modalities: InputModalities): void { + this.modalities = modalities; + } + /** * Reset streaming tool calls parser for new stream processing * This should be called at the beginning of each stream to prevent @@ -585,13 +602,19 @@ export class OpenAIContentConverter { } /** - * Create OpenAI media content part from Gemini part + * Create OpenAI media content part from Gemini part. + * Checks modality support before building each media type. 
*/ private createMediaContentPart(part: Part): OpenAIContentPart | null { if (part.inlineData?.mimeType && part.inlineData?.data) { const mimeType = part.inlineData.mimeType; const mediaType = this.getMediaType(mimeType); + const displayName = part.inlineData.displayName || mimeType; + if (mediaType === 'image') { + if (!this.modalities.image) { + return this.unsupportedModalityPlaceholder('image', displayName); + } const dataUrl = `data:${mimeType};base64,${part.inlineData.data}`; return { type: 'image_url' as const, @@ -600,6 +623,9 @@ export class OpenAIContentConverter { } if (mimeType === 'application/pdf') { + if (!this.modalities.pdf) { + return this.unsupportedModalityPlaceholder('pdf', displayName); + } const filename = part.inlineData.displayName || 'document.pdf'; return { type: 'file' as const, @@ -611,6 +637,9 @@ export class OpenAIContentConverter { } if (mediaType === 'audio') { + if (!this.modalities.audio) { + return this.unsupportedModalityPlaceholder('audio', displayName); + } const format = this.getAudioFormat(mimeType); if (format) { return { @@ -624,6 +653,9 @@ export class OpenAIContentConverter { } if (mediaType === 'video') { + if (!this.modalities.video) { + return this.unsupportedModalityPlaceholder('video', displayName); + } return { type: 'video_url' as const, video_url: { @@ -632,12 +664,9 @@ export class OpenAIContentConverter { }; } - const displayName = part.inlineData.displayName - ? 
` (${part.inlineData.displayName})` - : ''; return { type: 'text' as const, - text: `Unsupported inline media type: ${mimeType}${displayName}.`, + text: `Unsupported inline media type: ${mimeType} (${displayName}).`, }; } @@ -648,6 +677,9 @@ export class OpenAIContentConverter { const mediaType = this.getMediaType(mimeType); if (mediaType === 'image') { + if (!this.modalities.image) { + return this.unsupportedModalityPlaceholder('image', filename); + } return { type: 'image_url' as const, image_url: { url: fileUri }, @@ -655,6 +687,9 @@ export class OpenAIContentConverter { } if (mimeType === 'application/pdf') { + if (!this.modalities.pdf) { + return this.unsupportedModalityPlaceholder('pdf', filename); + } return { type: 'file' as const, file: { @@ -665,6 +700,9 @@ export class OpenAIContentConverter { } if (mediaType === 'video') { + if (!this.modalities.video) { + return this.unsupportedModalityPlaceholder('video', filename); + } return { type: 'video_url' as const, video_url: { @@ -673,18 +711,42 @@ export class OpenAIContentConverter { }; } - const displayName = part.fileData.displayName + const displayNameStr = part.fileData.displayName ? ` (${part.fileData.displayName})` : ''; return { type: 'text' as const, - text: `Unsupported file media type: ${mimeType}${displayName}.`, + text: `Unsupported file media type: ${mimeType}${displayNameStr}.`, }; } return null; } + /** + * Create a text placeholder for unsupported modalities. + */ + private unsupportedModalityPlaceholder( + modality: string, + displayName: string, + ): OpenAIContentPart { + debugLogger.warn( + `Model '${this.model}' does not support ${modality} input. ` + + `Replacing with text placeholder: ${displayName}`, + ); + let hint: string; + if (modality === 'pdf') { + hint = + 'This model does not support PDF input directly. The read_file tool cannot extract PDF content either. 
To extract text from the PDF file, try using skills if applicable, or guide user to install pdf skill by running this slash command:\n/extensions install https://github.com/anthropics/skills:document-skills'; + } else { + hint = `This model does not support ${modality} input. The read_file tool cannot process this type of file either. To handle this file, try using skills if applicable, or any tools installed at system wide, or let the user know you cannot process this type of file.`; + } + return { + type: 'text' as const, + text: `[Unsupported ${modality} file: "${displayName}". ${hint}]`, + }; + } + /** * Determine media type from MIME type */ diff --git a/packages/core/src/core/openaiContentGenerator/pipeline.test.ts b/packages/core/src/core/openaiContentGenerator/pipeline.test.ts index 964f768a3d..d71e23e913 100644 --- a/packages/core/src/core/openaiContentGenerator/pipeline.test.ts +++ b/packages/core/src/core/openaiContentGenerator/pipeline.test.ts @@ -47,6 +47,7 @@ describe('ContentGenerationPipeline', () => { // Mock converter mockConverter = { setModel: vi.fn(), + setModalities: vi.fn(), convertGeminiRequestToOpenAI: vi.fn(), convertOpenAIResponseToGemini: vi.fn(), convertOpenAIChunkToGemini: vi.fn(), @@ -104,6 +105,7 @@ describe('ContentGenerationPipeline', () => { expect(OpenAIContentConverter).toHaveBeenCalledWith( 'test-model', undefined, + {}, ); }); }); diff --git a/packages/core/src/core/openaiContentGenerator/pipeline.ts b/packages/core/src/core/openaiContentGenerator/pipeline.ts index 1865adb48c..8d2cc9fc76 100644 --- a/packages/core/src/core/openaiContentGenerator/pipeline.ts +++ b/packages/core/src/core/openaiContentGenerator/pipeline.ts @@ -46,6 +46,7 @@ export class ContentGenerationPipeline { this.converter = new OpenAIContentConverter( this.contentGeneratorConfig.model, this.contentGeneratorConfig.schemaCompliance, + this.contentGeneratorConfig.modalities ?? 
{}, ); } @@ -58,6 +59,7 @@ export class ContentGenerationPipeline { // that is not valid/available for the OpenAI-compatible backend. const effectiveModel = this.contentGeneratorConfig.model; this.converter.setModel(effectiveModel); + this.converter.setModalities(this.contentGeneratorConfig.modalities ?? {}); return this.executeWithErrorHandling( request, userPromptId, @@ -85,6 +87,7 @@ export class ContentGenerationPipeline { ): Promise> { const effectiveModel = this.contentGeneratorConfig.model; this.converter.setModel(effectiveModel); + this.converter.setModalities(this.contentGeneratorConfig.modalities ?? {}); return this.executeWithErrorHandling( request, userPromptId, diff --git a/packages/core/src/core/openaiContentGenerator/provider/dashscope.test.ts b/packages/core/src/core/openaiContentGenerator/provider/dashscope.test.ts index f9d7a0fd6e..2e528120af 100644 --- a/packages/core/src/core/openaiContentGenerator/provider/dashscope.test.ts +++ b/packages/core/src/core/openaiContentGenerator/provider/dashscope.test.ts @@ -733,7 +733,7 @@ describe('DashScopeOpenAICompatibleProvider', () => { describe('output token limits', () => { it('should limit max_tokens when it exceeds model limit', () => { const request: OpenAI.Chat.ChatCompletionCreateParams = { - model: 'qwen3-coder-plus', + model: 'qwen3-max', messages: [{ role: 'user', content: 'Hello' }], max_tokens: 100000, // Exceeds the model's output limit }; @@ -757,7 +757,7 @@ describe('DashScopeOpenAICompatibleProvider', () => { it('should not modify max_tokens when it is within model limit', () => { const request: OpenAI.Chat.ChatCompletionCreateParams = { - model: 'qwen3-coder-plus', + model: 'qwen3-max', messages: [{ role: 'user', content: 'Hello' }], max_tokens: 1000, // Within the model's output limit }; @@ -769,7 +769,7 @@ describe('DashScopeOpenAICompatibleProvider', () => { it('should not add max_tokens when not present in request', () => { const request: OpenAI.Chat.ChatCompletionCreateParams = { - 
model: 'qwen3-coder-plus', + model: 'qwen3-max', messages: [{ role: 'user', content: 'Hello' }], // No max_tokens parameter }; @@ -781,7 +781,7 @@ describe('DashScopeOpenAICompatibleProvider', () => { it('should handle null max_tokens parameter', () => { const request: OpenAI.Chat.ChatCompletionCreateParams = { - model: 'qwen3-coder-plus', + model: 'qwen3-max', messages: [{ role: 'user', content: 'Hello' }], max_tokens: null, }; @@ -800,12 +800,12 @@ describe('DashScopeOpenAICompatibleProvider', () => { const result = provider.buildRequest(request, 'test-prompt-id'); - expect(result.max_tokens).toBe(4096); // Should be limited to default output limit (4K) + expect(result.max_tokens).toBe(8192); // Should be limited to default output limit (8K) }); it('should preserve other request parameters when limiting max_tokens', () => { const request: OpenAI.Chat.ChatCompletionCreateParams = { - model: 'qwen3-coder-plus', + model: 'qwen3-max', messages: [{ role: 'user', content: 'Hello' }], max_tokens: 100000, // Will be limited temperature: 0.8, @@ -872,12 +872,10 @@ describe('DashScopeOpenAICompatibleProvider', () => { ], }, ], - max_tokens: 50000, }; const result = provider.buildRequest(request, 'test-prompt-id'); - expect(result.max_tokens).toBe(32768); // Limited to model's output limit (32K) expect( (result as { vl_high_resolution_images?: boolean }) .vl_high_resolution_images, @@ -904,8 +902,7 @@ describe('DashScopeOpenAICompatibleProvider', () => { const result = provider.buildRequest(request, 'test-prompt-id'); - // coder-model has 64K output limit, so max_tokens should be capped - expect(result.max_tokens).toBe(65536); + expect(result.max_tokens).toBe(65536); // Limited to model's output limit (64K) expect( (result as { vl_high_resolution_images?: boolean }) .vl_high_resolution_images, @@ -914,7 +911,7 @@ describe('DashScopeOpenAICompatibleProvider', () => { it('should handle streaming requests with output token limits', () => { const request: 
OpenAI.Chat.ChatCompletionCreateParams = { - model: 'qwen3-coder-plus', + model: 'qwen3-max', messages: [{ role: 'user', content: 'Hello' }], max_tokens: 100000, // Exceeds the model's output limit stream: true, diff --git a/packages/core/src/core/openaiContentGenerator/provider/deepseek.test.ts b/packages/core/src/core/openaiContentGenerator/provider/deepseek.test.ts index 68693393b0..9a69cd3269 100644 --- a/packages/core/src/core/openaiContentGenerator/provider/deepseek.test.ts +++ b/packages/core/src/core/openaiContentGenerator/provider/deepseek.test.ts @@ -5,7 +5,6 @@ */ import { describe, it, expect, vi, beforeEach } from 'vitest'; -import type OpenAI from 'openai'; import { DeepSeekOpenAICompatibleProvider } from './deepseek.js'; import type { ContentGeneratorConfig } from '../../contentGenerator.js'; import type { Config } from '../../../config/config.js'; @@ -18,7 +17,6 @@ vi.mock('openai', () => ({ })); describe('DeepSeekOpenAICompatibleProvider', () => { - let provider: DeepSeekOpenAICompatibleProvider; let mockContentGeneratorConfig: ContentGeneratorConfig; let mockCliConfig: Config; @@ -34,11 +32,6 @@ describe('DeepSeekOpenAICompatibleProvider', () => { mockCliConfig = { getCliVersion: vi.fn().mockReturnValue('1.0.0'), } as unknown as Config; - - provider = new DeepSeekOpenAICompatibleProvider( - mockContentGeneratorConfig, - mockCliConfig, - ); }); describe('isDeepSeekProvider', () => { @@ -61,72 +54,15 @@ describe('DeepSeekOpenAICompatibleProvider', () => { }); }); - describe('buildRequest', () => { - const userPromptId = 'prompt-123'; - - it('converts array content into a string', () => { - const originalRequest: OpenAI.Chat.ChatCompletionCreateParams = { - model: 'deepseek-chat', - messages: [ - { - role: 'user', - content: [ - { type: 'text', text: 'Hello' }, - { type: 'text', text: ' world' }, - ], - }, - ], - }; - - const result = provider.buildRequest(originalRequest, userPromptId); - - expect(result.messages).toHaveLength(1); - 
expect(result.messages?.[0]).toEqual({ - role: 'user', - content: 'Hello world', + describe('getDefaultGenerationConfig', () => { + it('returns temperature 0', () => { + const provider = new DeepSeekOpenAICompatibleProvider( + mockContentGeneratorConfig, + mockCliConfig, + ); + expect(provider.getDefaultGenerationConfig()).toEqual({ + temperature: 0, }); - expect(originalRequest.messages?.[0].content).toEqual([ - { type: 'text', text: 'Hello' }, - { type: 'text', text: ' world' }, - ]); - }); - - it('leaves string content unchanged', () => { - const originalRequest: OpenAI.Chat.ChatCompletionCreateParams = { - model: 'deepseek-chat', - messages: [ - { - role: 'user', - content: 'Hello world', - }, - ], - }; - - const result = provider.buildRequest(originalRequest, userPromptId); - - expect(result.messages?.[0].content).toBe('Hello world'); - }); - - it('throws when encountering non-text multimodal parts', () => { - const originalRequest: OpenAI.Chat.ChatCompletionCreateParams = { - model: 'deepseek-chat', - messages: [ - { - role: 'user', - content: [ - { type: 'text', text: 'Hello' }, - { - type: 'image_url', - image_url: { url: 'https://example.com/image.png' }, - }, - ], - }, - ], - }; - - expect(() => - provider.buildRequest(originalRequest, userPromptId), - ).toThrow(/only supports text content/i); }); }); }); diff --git a/packages/core/src/core/openaiContentGenerator/provider/deepseek.ts b/packages/core/src/core/openaiContentGenerator/provider/deepseek.ts index 9b5fd7479d..0e246725fd 100644 --- a/packages/core/src/core/openaiContentGenerator/provider/deepseek.ts +++ b/packages/core/src/core/openaiContentGenerator/provider/deepseek.ts @@ -4,7 +4,6 @@ * SPDX-License-Identifier: Apache-2.0 */ -import type OpenAI from 'openai'; import type { Config } from '../../../config/config.js'; import type { ContentGeneratorConfig } from '../../contentGenerator.js'; import { DefaultOpenAICompatibleProvider } from './default.js'; @@ -26,58 +25,6 @@ export class 
DeepSeekOpenAICompatibleProvider extends DefaultOpenAICompatiblePro return baseUrl.toLowerCase().includes('api.deepseek.com'); } - override buildRequest( - request: OpenAI.Chat.ChatCompletionCreateParams, - userPromptId: string, - ): OpenAI.Chat.ChatCompletionCreateParams { - const baseRequest = super.buildRequest(request, userPromptId); - if (!baseRequest.messages?.length) { - return baseRequest; - } - - const messages = baseRequest.messages.map((message) => { - if (!('content' in message)) { - return message; - } - - const { content } = message; - - if ( - typeof content === 'string' || - content === null || - content === undefined - ) { - return message; - } - - if (!Array.isArray(content)) { - return message; - } - - const text = content - .map((part) => { - if (part.type !== 'text') { - throw new Error( - `DeepSeek provider only supports text content. Found non-text part of type '${part.type}' in message with role '${message.role}'.`, - ); - } - - return part.text ?? ''; - }) - .join(''); - - return { - ...message, - content: text, - } as OpenAI.Chat.ChatCompletionMessageParam; - }); - - return { - ...baseRequest, - messages, - }; - } - override getDefaultGenerationConfig(): GenerateContentConfig { return { temperature: 0, diff --git a/packages/core/src/core/tokenLimits.test.ts b/packages/core/src/core/tokenLimits.test.ts index ffd71cd4be..edea10a100 100644 --- a/packages/core/src/core/tokenLimits.test.ts +++ b/packages/core/src/core/tokenLimits.test.ts @@ -91,183 +91,143 @@ describe('normalize', () => { }); describe('tokenLimit', () => { - // Test cases for each model family describe('Google Gemini', () => { - it('should return the correct limit for Gemini 1.5 Pro', () => { - expect(tokenLimit('gemini-1.5-pro')).toBe(2097152); + it('should return 1M for Gemini 3.x (latest)', () => { + expect(tokenLimit('gemini-3-pro-preview')).toBe(1000000); + expect(tokenLimit('gemini-3-flash-preview')).toBe(1000000); + 
expect(tokenLimit('gemini-3.1-pro-preview')).toBe(1000000); }); - it('should return the correct limit for Gemini 1.5 Flash', () => { - expect(tokenLimit('gemini-1.5-flash')).toBe(1048576); - }); - it('should return the correct limit for Gemini 2.5 Pro', () => { - expect(tokenLimit('gemini-2.5-pro')).toBe(1048576); - }); - it('should return the correct limit for Gemini 2.5 Flash', () => { - expect(tokenLimit('gemini-2.5-flash')).toBe(1048576); - }); - it('should return the correct limit for Gemini 2.0 Flash with image generation', () => { - expect(tokenLimit('gemini-2.0-flash-image-generation')).toBe(32768); - }); - it('should return the correct limit for Gemini 2.0 Flash', () => { - expect(tokenLimit('gemini-2.0-flash')).toBe(1048576); + + it('should return 1M for legacy Gemini (fallback)', () => { + expect(tokenLimit('gemini-2.5-pro')).toBe(1000000); + expect(tokenLimit('gemini-2.5-flash')).toBe(1000000); + expect(tokenLimit('gemini-2.0-flash')).toBe(1000000); + expect(tokenLimit('gemini-1.5-pro')).toBe(1000000); + expect(tokenLimit('gemini-1.5-flash')).toBe(1000000); }); }); describe('OpenAI', () => { - it('should return the correct limit for o3-mini', () => { - expect(tokenLimit('o3-mini')).toBe(200000); + it('should return 400K for GPT-5.x (latest)', () => { + expect(tokenLimit('gpt-5')).toBe(400000); + expect(tokenLimit('gpt-5-mini')).toBe(400000); + expect(tokenLimit('gpt-5.2')).toBe(400000); + expect(tokenLimit('gpt-5.2-pro')).toBe(400000); }); - it('should return the correct limit for o3 models', () => { - expect(tokenLimit('o3')).toBe(200000); - }); - it('should return the correct limit for o4-mini', () => { - expect(tokenLimit('o4-mini')).toBe(200000); - }); - it('should return the correct limit for gpt-4o-mini', () => { - expect(tokenLimit('gpt-4o-mini')).toBe(131072); - }); - it('should return the correct limit for gpt-4o', () => { + + it('should return 128K for legacy GPT (fallback)', () => { expect(tokenLimit('gpt-4o')).toBe(131072); - }); - 
it('should return the correct limit for gpt-4.1-mini', () => { - expect(tokenLimit('gpt-4.1-mini')).toBe(1048576); - }); - it('should return the correct limit for gpt-4.1 models', () => { - expect(tokenLimit('gpt-4.1')).toBe(1048576); - }); - it('should return the correct limit for gpt-4', () => { + expect(tokenLimit('gpt-4o-mini')).toBe(131072); + expect(tokenLimit('gpt-4.1')).toBe(131072); expect(tokenLimit('gpt-4')).toBe(131072); }); + + it('should return 200K for o-series', () => { + expect(tokenLimit('o3')).toBe(200000); + expect(tokenLimit('o3-mini')).toBe(200000); + expect(tokenLimit('o4-mini')).toBe(200000); + }); }); describe('Anthropic Claude', () => { - it('should return the correct limit for Claude 3.5 Sonnet', () => { + it('should return 200K for all Claude models', () => { + expect(tokenLimit('claude-opus-4-6')).toBe(200000); + expect(tokenLimit('claude-sonnet-4-6')).toBe(200000); + expect(tokenLimit('claude-sonnet-4')).toBe(200000); + expect(tokenLimit('claude-opus-4')).toBe(200000); expect(tokenLimit('claude-3.5-sonnet')).toBe(200000); - }); - it('should return the correct limit for Claude 3.7 Sonnet', () => { - expect(tokenLimit('claude-3.7-sonnet')).toBe(1048576); - }); - it('should return the correct limit for Claude Sonnet 4', () => { - expect(tokenLimit('claude-sonnet-4')).toBe(1048576); - }); - it('should return the correct limit for Claude Opus 4', () => { - expect(tokenLimit('claude-opus-4')).toBe(1048576); + expect(tokenLimit('claude-3.7-sonnet')).toBe(200000); }); }); describe('Alibaba Qwen', () => { - it('should return the correct limit for qwen3-coder commercial models', () => { - expect(tokenLimit('qwen3-coder-plus')).toBe(1048576); - expect(tokenLimit('qwen3-coder-plus-20250601')).toBe(1048576); - expect(tokenLimit('qwen3-coder-flash')).toBe(1048576); - expect(tokenLimit('qwen3-coder-flash-20250601')).toBe(1048576); - }); - - it('should return the correct limit for qwen3-coder open source models', () => { + it('should return 1M for 
commercial Qwen3 models', () => { + expect(tokenLimit('qwen3-coder-plus')).toBe(1000000); + expect(tokenLimit('qwen3-coder-plus-20250601')).toBe(1000000); + expect(tokenLimit('qwen3-coder-flash')).toBe(1000000); + expect(tokenLimit('qwen3.5-plus')).toBe(1000000); + expect(tokenLimit('coder-model')).toBe(1000000); + }); + + it('should return 256K for Qwen3 non-commercial models', () => { + expect(tokenLimit('qwen3-max')).toBe(262144); + expect(tokenLimit('qwen3-max-2026-01-23')).toBe(262144); + expect(tokenLimit('qwen3-vl-plus')).toBe(262144); expect(tokenLimit('qwen3-coder-7b')).toBe(262144); - expect(tokenLimit('qwen3-coder-480b-a35b-instruct')).toBe(262144); - expect(tokenLimit('qwen3-coder-30b-a3b-instruct')).toBe(262144); - }); - - it('should return the correct limit for qwen3 2507 variants', () => { - expect(tokenLimit('qwen3-some-model-2507-instruct')).toBe(262144); - }); - - it('should return the correct limit for qwen2.5-1m', () => { - expect(tokenLimit('qwen2.5-1m')).toBe(1048576); - expect(tokenLimit('qwen2.5-1m-instruct')).toBe(1048576); + expect(tokenLimit('qwen3-coder-next')).toBe(262144); }); - it('should return the correct limit for qwen2.5', () => { - expect(tokenLimit('qwen2.5')).toBe(131072); - expect(tokenLimit('qwen2.5-instruct')).toBe(131072); + it('should return 1M for studio latest models', () => { + expect(tokenLimit('qwen-plus-latest')).toBe(1000000); + expect(tokenLimit('qwen-flash-latest')).toBe(1000000); }); - it('should return the correct limit for qwen-plus', () => { - expect(tokenLimit('qwen-plus-latest')).toBe(1048576); - expect(tokenLimit('qwen-plus')).toBe(131072); - }); - - it('should return the correct limit for qwen-flash', () => { - expect(tokenLimit('qwen-flash-latest')).toBe(1048576); - }); - - it('should return the correct limit for qwen-turbo', () => { - expect(tokenLimit('qwen-turbo')).toBe(131072); - expect(tokenLimit('qwen-turbo-latest')).toBe(131072); + it('should return 256K for Qwen fallback', () => { + 
expect(tokenLimit('qwen-plus')).toBe(262144); + expect(tokenLimit('qwen-turbo')).toBe(262144); + expect(tokenLimit('qwen2.5')).toBe(262144); + expect(tokenLimit('qwen-vl-max-latest')).toBe(262144); }); }); - describe('ByteDance Seed-OSS', () => { - it('should return the correct limit for seed-oss', () => { - expect(tokenLimit('seed-oss')).toBe(524288); + describe('DeepSeek', () => { + it('should return 128K for DeepSeek models', () => { + expect(tokenLimit('deepseek-r1')).toBe(131072); + expect(tokenLimit('deepseek-v3')).toBe(131072); + expect(tokenLimit('deepseek-chat')).toBe(131072); }); }); describe('Zhipu GLM', () => { - it('should return the correct limit for glm-4.5v', () => { - expect(tokenLimit('glm-4.5v')).toBe(65536); - }); - it('should return the correct limit for glm-4.5-air', () => { - expect(tokenLimit('glm-4.5-air')).toBe(131072); - }); - it('should return the correct limit for glm-4.5', () => { - expect(tokenLimit('glm-4.5')).toBe(131072); + it('should return 200K for GLM-5 and GLM-4.7 (latest)', () => { + expect(tokenLimit('glm-5')).toBe(202752); + expect(tokenLimit('glm-4.7')).toBe(202752); }); - it('should return the correct limit for glm-4.6', () => { - expect(tokenLimit('glm-4.6')).toBe(202752); + + it('should return 200K for legacy GLM (fallback)', () => { + expect(tokenLimit('glm-4.5')).toBe(202752); + expect(tokenLimit('glm-4.5v')).toBe(202752); + expect(tokenLimit('glm-4.5-air')).toBe(202752); }); }); - describe('DeepSeek', () => { - it('should return the correct limit for deepseek-r1', () => { - expect(tokenLimit('deepseek-r1')).toBe(131072); - }); - it('should return the correct limit for deepseek-v3', () => { - expect(tokenLimit('deepseek-v3')).toBe(131072); + describe('MiniMax', () => { + it('should return 1M for MiniMax-M2.5 (latest)', () => { + expect(tokenLimit('MiniMax-M2.5')).toBe(1000000); }); - it('should return the correct limit for deepseek-v3.1', () => { - expect(tokenLimit('deepseek-v3.1')).toBe(131072); - }); - it('should 
return the correct limit for deepseek-v3.2', () => { - expect(tokenLimit('deepseek-v3.2-exp')).toBe(131072); + + it('should return 200K for MiniMax fallback', () => { + expect(tokenLimit('MiniMax-M2.1')).toBe(200000); }); }); describe('Moonshot Kimi', () => { - it('should return the correct limit for kimi-k2 variants', () => { - expect(tokenLimit('kimi-k2-0905-preview')).toBe(262144); // 256K + it('should return 256K for Kimi models', () => { + expect(tokenLimit('kimi-k2.5')).toBe(262144); expect(tokenLimit('kimi-k2-0905')).toBe(262144); - expect(tokenLimit('kimi-k2-turbo-preview')).toBe(262144); expect(tokenLimit('kimi-k2-turbo')).toBe(262144); - expect(tokenLimit('kimi-k2-0711-preview')).toBe(262144); - expect(tokenLimit('kimi-k2-instruct')).toBe(262144); }); }); describe('Other models', () => { - it('should return the correct limit for gpt-oss', () => { - expect(tokenLimit('gpt-oss')).toBe(131072); - }); - it('should return the correct limit for llama-4-scout', () => { - expect(tokenLimit('llama-4-scout')).toBe(10485760); + it('should return correct limits for other known models', () => { + expect(tokenLimit('seed-oss')).toBe(524288); }); - it('should return the correct limit for mistral-large-2', () => { - expect(tokenLimit('mistral-large-2')).toBe(131072); + + it('should return the default token limit for unknown models', () => { + expect(tokenLimit('llama-4-scout')).toBe(DEFAULT_TOKEN_LIMIT); }); }); - // Test for default limit it('should return the default token limit for an unknown model', () => { expect(tokenLimit('unknown-model-v1.0')).toBe(DEFAULT_TOKEN_LIMIT); + expect(tokenLimit('mistral-large-2')).toBe(DEFAULT_TOKEN_LIMIT); }); - // Test with complex model string it('should return the correct limit for a complex model string', () => { expect(tokenLimit(' a/b/c|GPT-4o:gpt-4o-2024-05-13-q4 ')).toBe(131072); }); - // Test case-insensitive matching it('should handle case-insensitive model names', () => { expect(tokenLimit('GPT-4O')).toBe(131072); 
expect(tokenLimit('CLAUDE-3.5-SONNET')).toBe(200000); @@ -275,99 +235,97 @@ describe('tokenLimit', () => { }); describe('tokenLimit with output type', () => { - describe('Qwen models with output limits', () => { - it('should return the correct output limit for qwen3-coder-plus', () => { - expect(tokenLimit('qwen3-coder-plus', 'output')).toBe(65536); - expect(tokenLimit('qwen3-coder-plus-20250601', 'output')).toBe(65536); + describe('latest models output limits', () => { + it('should return correct output limits for GPT-5.x', () => { + expect(tokenLimit('gpt-5.2', 'output')).toBe(131072); + expect(tokenLimit('gpt-5-mini', 'output')).toBe(131072); }); - it('should return the correct output limit for qwen-vl-max-latest', () => { - expect(tokenLimit('qwen-vl-max-latest', 'output')).toBe(8192); + it('should return correct output limits for Gemini 3.x', () => { + expect(tokenLimit('gemini-3-pro-preview', 'output')).toBe(65536); + expect(tokenLimit('gemini-3-flash-preview', 'output')).toBe(65536); + }); + + it('should return correct output limits for Claude 4.6', () => { + expect(tokenLimit('claude-opus-4-6', 'output')).toBe(131072); + expect(tokenLimit('claude-sonnet-4-6', 'output')).toBe(65536); }); }); - describe('Default output limits', () => { - it('should return the default output limit for unknown models', () => { - expect(tokenLimit('unknown-model', 'output')).toBe( - DEFAULT_OUTPUT_TOKEN_LIMIT, - ); - expect(tokenLimit('gpt-4', 'output')).toBe(DEFAULT_OUTPUT_TOKEN_LIMIT); - expect(tokenLimit('claude-3.5-sonnet', 'output')).toBe( - DEFAULT_OUTPUT_TOKEN_LIMIT, - ); + describe('legacy model output fallbacks', () => { + it('should return fallback output limits for legacy GPT', () => { + expect(tokenLimit('gpt-4o', 'output')).toBe(16384); }); - it('should return the default output limit for models without specific output patterns', () => { - expect(tokenLimit('qwen3-coder-7b', 'output')).toBe( - DEFAULT_OUTPUT_TOKEN_LIMIT, - ); - expect(tokenLimit('qwen-plus', 
'output')).toBe( - DEFAULT_OUTPUT_TOKEN_LIMIT, - ); - expect(tokenLimit('qwen-vl-max', 'output')).toBe( - DEFAULT_OUTPUT_TOKEN_LIMIT, - ); + it('should return fallback output limits for legacy Gemini', () => { + expect(tokenLimit('gemini-2.5-pro', 'output')).toBe(8192); + }); + + it('should return fallback output limits for legacy Claude', () => { + expect(tokenLimit('claude-sonnet-4', 'output')).toBe(65536); + expect(tokenLimit('claude-opus-4', 'output')).toBe(65536); + }); + }); + + describe('Qwen output limits', () => { + it('should return correct output limits for Qwen models', () => { + expect(tokenLimit('qwen3.5-plus', 'output')).toBe(65536); + expect(tokenLimit('qwen3-max', 'output')).toBe(65536); + expect(tokenLimit('qwen3-max-2026-01-23', 'output')).toBe(65536); + expect(tokenLimit('coder-model', 'output')).toBe(65536); + // Models without specific output limits fall back to default + expect(tokenLimit('qwen3-coder-plus', 'output')).toBe(8192); + expect(tokenLimit('qwen3-coder-next', 'output')).toBe(8192); + expect(tokenLimit('qwen3-vl-plus', 'output')).toBe(8192); + expect(tokenLimit('qwen-vl-max-latest', 'output')).toBe(8192); }); }); - describe('Input vs Output limits comparison', () => { - it('should return different limits for input vs output for qwen3-coder-plus', () => { - expect(tokenLimit('qwen3-coder-plus', 'input')).toBe(1048576); // 1M input - expect(tokenLimit('qwen3-coder-plus', 'output')).toBe(65536); // 64K output + describe('other output limits', () => { + it('should return correct output limits for DeepSeek', () => { + expect(tokenLimit('deepseek-reasoner', 'output')).toBe(65536); + expect(tokenLimit('deepseek-chat', 'output')).toBe(8192); }); - it('should return different limits for input vs output for qwen-vl-max-latest', () => { - expect(tokenLimit('qwen-vl-max-latest', 'input')).toBe(131072); // 128K input - expect(tokenLimit('qwen-vl-max-latest', 'output')).toBe(8192); // 8K output + it('should return correct output limits for GLM', 
() => { + expect(tokenLimit('glm-5', 'output')).toBe(16384); + expect(tokenLimit('glm-4.7', 'output')).toBe(16384); }); - it('should return different limits for input vs output for qwen3-vl-plus', () => { - expect(tokenLimit('qwen3-vl-plus', 'input')).toBe(262144); // 256K input - expect(tokenLimit('qwen3-vl-plus', 'output')).toBe(32768); // 32K output + it('should return correct output limits for MiniMax', () => { + expect(tokenLimit('MiniMax-M2.5', 'output')).toBe(65536); }); - it('should return same default limits for unknown models', () => { - expect(tokenLimit('unknown-model', 'input')).toBe(DEFAULT_TOKEN_LIMIT); // 128K input + it('should return correct output limits for Kimi', () => { + expect(tokenLimit('kimi-k2.5', 'output')).toBe(32768); + }); + }); + + describe('default output limits', () => { + it('should return the default output limit for unknown models', () => { expect(tokenLimit('unknown-model', 'output')).toBe( DEFAULT_OUTPUT_TOKEN_LIMIT, - ); // 4K output + ); }); }); - describe('Backward compatibility', () => { - it('should default to input type when no type is specified', () => { - expect(tokenLimit('qwen3-coder-plus')).toBe(1048576); // Should be input limit - expect(tokenLimit('qwen-vl-max-latest')).toBe(131072); // Should be input limit - expect(tokenLimit('unknown-model')).toBe(DEFAULT_TOKEN_LIMIT); // Should be input default + describe('input vs output comparison', () => { + it('should return different limits for input vs output', () => { + expect(tokenLimit('qwen3-max', 'input')).toBe(262144); + expect(tokenLimit('qwen3-max', 'output')).toBe(65536); }); - it('should work with explicit input type', () => { - expect(tokenLimit('qwen3-coder-plus', 'input')).toBe(1048576); - expect(tokenLimit('qwen-vl-max-latest', 'input')).toBe(131072); - expect(tokenLimit('unknown-model', 'input')).toBe(DEFAULT_TOKEN_LIMIT); + it('should default to input type when no type is specified', () => { + expect(tokenLimit('qwen3-coder-plus')).toBe(1000000); + 
expect(tokenLimit('unknown-model')).toBe(DEFAULT_TOKEN_LIMIT); }); }); - describe('Model normalization with output limits', () => { + describe('normalization with output limits', () => { it('should handle normalized model names for output limits', () => { - expect(tokenLimit('QWEN3-CODER-PLUS', 'output')).toBe(65536); - expect(tokenLimit('qwen3-coder-plus-20250601', 'output')).toBe(65536); + expect(tokenLimit('QWEN3-MAX', 'output')).toBe(65536); + expect(tokenLimit('qwen3-max-20250601', 'output')).toBe(65536); expect(tokenLimit('QWEN-VL-MAX-LATEST', 'output')).toBe(8192); }); - - it('should handle complex model strings for output limits', () => { - expect( - tokenLimit( - ' a/b/c|QWEN3-CODER-PLUS:qwen3-coder-plus-2024-05-13 ', - 'output', - ), - ).toBe(65536); - expect( - tokenLimit( - 'provider/qwen-vl-max-latest:qwen-vl-max-latest-v1', - 'output', - ), - ).toBe(8192); - }); }); }); diff --git a/packages/core/src/core/tokenLimits.ts b/packages/core/src/core/tokenLimits.ts index 2419e51a1b..d038133cb2 100644 --- a/packages/core/src/core/tokenLimits.ts +++ b/packages/core/src/core/tokenLimits.ts @@ -9,23 +9,23 @@ type TokenCount = number; export type TokenLimitType = 'input' | 'output'; export const DEFAULT_TOKEN_LIMIT: TokenCount = 131_072; // 128K (power-of-two) -export const DEFAULT_OUTPUT_TOKEN_LIMIT: TokenCount = 4_096; // 4K tokens +export const DEFAULT_OUTPUT_TOKEN_LIMIT: TokenCount = 8_192; // 8K tokens /** * Accurate numeric limits: * - power-of-two approximations (128K -> 131072, 256K -> 262144, etc.) - * - vendor-declared exact values (e.g., 200k -> 200000) are used as stated in docs. + * - vendor-declared exact values (e.g., 200k -> 200000, 1m -> 1000000) are + * used as stated in docs. */ const LIMITS = { '32k': 32_768, '64k': 65_536, '128k': 131_072, - '200k': 200_000, // vendor-declared decimal, used by OpenAI, Anthropic, GLM etc. + '200k': 200_000, // vendor-declared decimal, used by OpenAI, Anthropic, etc. 
'256k': 262_144, + '400k': 400_000, // vendor-declared decimal, used by OpenAI GPT-5.x '512k': 524_288, - '1m': 1_048_576, - '2m': 2_097_152, - '10m': 10_485_760, // 10 million tokens + '1m': 1_000_000, // Output token limits (typically much smaller than input limits) '4k': 4_096, '8k': 8_192, @@ -81,110 +81,64 @@ const PATTERNS: Array<[RegExp, TokenCount]> = [ // ------------------- // Google Gemini // ------------------- - [/^gemini-1\.5-pro$/, LIMITS['2m']], - [/^gemini-1\.5-flash$/, LIMITS['1m']], - [/^gemini-2\.5-pro.*$/, LIMITS['1m']], - [/^gemini-2\.5-flash.*$/, LIMITS['1m']], - [/^gemini-2\.0-flash-image-generation$/, LIMITS['32k']], - [/^gemini-2\.0-flash.*$/, LIMITS['1m']], + [/^gemini-3/, LIMITS['1m']], // Gemini 3.x (Pro, Flash, 3.1, etc.): 1M + [/^gemini-/, LIMITS['1m']], // Gemini fallback (1.5, 2.x): 1M // ------------------- - // OpenAI (o3 / o4-mini / gpt-4.1 / gpt-4o family) - // o3 and o4-mini document a 200,000-token context window (decimal). - // Note: GPT-4.1 models typically report 1_048_576 (1M) context in OpenAI announcements. - [/^o3(?:-mini|$).*$/, LIMITS['200k']], - [/^o3.*$/, LIMITS['200k']], - [/^o4-mini.*$/, LIMITS['200k']], - [/^gpt-4\.1-mini.*$/, LIMITS['1m']], - [/^gpt-4\.1.*$/, LIMITS['1m']], - [/^gpt-4o-mini.*$/, LIMITS['128k']], - [/^gpt-4o.*$/, LIMITS['128k']], - [/^gpt-4.*$/, LIMITS['128k']], + // OpenAI + // ------------------- + [/^gpt-5/, LIMITS['400k']], // GPT-5.x: 400K + [/^gpt-/, LIMITS['128k']], // GPT fallback (4o, 4.1, etc.): 128K + [/^o\d/, LIMITS['200k']], // o-series (o3, o4-mini, etc.): 200K // ------------------- // Anthropic Claude - // - Claude Sonnet / Sonnet 3.5 and related Sonnet variants: 200,000 tokens documented. - // - Some Sonnet/Opus models offer 1M in beta/enterprise tiers (handled separately if needed). 
- [/^claude-3\.5-sonnet.*$/, LIMITS['200k']], - [/^claude-3\.7-sonnet.*$/, LIMITS['1m']], // some Sonnet 3.7/Opus variants advertise 1M beta in docs - [/^claude-sonnet-4.*$/, LIMITS['1m']], - [/^claude-opus-4.*$/, LIMITS['1m']], + // ------------------- + [/^claude-/, LIMITS['200k']], // All Claude models: 200K // ------------------- // Alibaba / Qwen // ------------------- - // Commercial Qwen3-Coder-Plus: 1M token context - [/^qwen3-coder-plus(-.*)?$/, LIMITS['1m']], // catches "qwen3-coder-plus" and date variants - - // Commercial Qwen3-Coder-Flash: 1M token context - [/^qwen3-coder-flash(-.*)?$/, LIMITS['1m']], // catches "qwen3-coder-flash" and date variants - - // Commercial Qwen3.5-Plus: 1M token context - [/^qwen3\.5-plus(-.*)?$/, LIMITS['1m']], // catches "qwen3.5-plus" and date variants - - // Generic coder-model: same as qwen3.5-plus (1M token context) - [/^coder-model$/, LIMITS['1m']], - - // Commercial Qwen3-Max-Preview: 256K token context - [/^qwen3-max(-preview)?(-.*)?$/, LIMITS['256k']], // catches "qwen3-max" or "qwen3-max-preview" and date variants - - // Open-source Qwen3-Coder variants: 256K native - [/^qwen3-coder-.*$/, LIMITS['256k']], - // Open-source Qwen3 2507 variants: 256K native - [/^qwen3-.*-2507-.*$/, LIMITS['256k']], - - // Open-source long-context Qwen2.5-1M - [/^qwen2\.5-1m.*$/, LIMITS['1m']], - - // Standard Qwen2.5: 128K - [/^qwen2\.5.*$/, LIMITS['128k']], - - // Studio commercial Qwen-Plus / Qwen-Flash / Qwen-Turbo - [/^qwen-plus-latest$/, LIMITS['1m']], // Commercial latest: 1M - [/^qwen-plus.*$/, LIMITS['128k']], // Standard: 128K + // Commercial API models (1,000,000 context) + [/^qwen3-coder-plus/, LIMITS['1m']], + [/^qwen3-coder-flash/, LIMITS['1m']], + [/^qwen3\.5-plus/, LIMITS['1m']], + [/^qwen-plus-latest$/, LIMITS['1m']], [/^qwen-flash-latest$/, LIMITS['1m']], - [/^qwen-turbo.*$/, LIMITS['128k']], - - // Qwen Vision Models - [/^qwen3-vl-plus$/, LIMITS['256k']], // Qwen3-VL-Plus: 256K input - [/^qwen-vl-max.*$/, 
LIMITS['128k']], + [/^coder-model$/, LIMITS['1m']], + // Commercial API models (256K context) + [/^qwen3-max/, LIMITS['256k']], + // Open-source Qwen3 variants: 256K native + [/^qwen3-coder-/, LIMITS['256k']], + // Qwen fallback (VL, turbo, plus, 2.5, etc.): 256K + [/^qwen/, LIMITS['256k']], // ------------------- - // ByteDance Seed-OSS (512K) + // DeepSeek // ------------------- - [/^seed-oss.*$/, LIMITS['512k']], + [/^deepseek/, LIMITS['128k']], // ------------------- // Zhipu GLM // ------------------- - [/^glm-4\.5v(?:-.*)?$/, LIMITS['64k']], - [/^glm-4\.5-air(?:-.*)?$/, LIMITS['128k']], - [/^glm-4\.5(?:-.*)?$/, LIMITS['128k']], - [/^glm-4\.6(?:-.*)?$/, 202_752 as unknown as TokenCount], // exact limit from the model config file - [/^glm-4\.7(?:-.*)?$/, LIMITS['200k']], + [/^glm-5/, 202_752 as TokenCount], // GLM-5: exact vendor limit + [/^glm-/, 202_752 as TokenCount], // GLM fallback: 202,752 (~200K) // ------------------- - // DeepSeek + // MiniMax // ------------------- - [/^deepseek(?:-.*)?$/, LIMITS['128k']], + [/^minimax-m2\.5/i, LIMITS['1m']], // MiniMax-M2.5: 1,000,000 + [/^minimax-/i, LIMITS['200k']], // MiniMax fallback: 200K // ------------------- // Moonshot / Kimi // ------------------- - [/^kimi-2\.5.*$/, LIMITS['256k']], // Kimi-2.5: 256K context - [/^kimi-k2.*$/, LIMITS['256k']], // Kimi-k2 variants: 256K context - - // ------------------- - // GPT-OSS / Llama & Mistral examples - // ------------------- - [/^gpt-oss.*$/, LIMITS['128k']], - [/^llama-4-scout.*$/, LIMITS['10m']], - [/^mistral-large-2.*$/, LIMITS['128k']], + [/^kimi-/, LIMITS['256k']], // Kimi fallback: 256K // ------------------- - // MiniMax + // ByteDance Seed-OSS (512K) // ------------------- - [/^minimax-m2\.1.*$/i, LIMITS['200k']], // MiniMax-M2.1: 200K context + [/^seed-oss/, LIMITS['512k']], ]; /** @@ -193,32 +147,38 @@ const PATTERNS: Array<[RegExp, TokenCount]> = [ * in a single response for specific models. 
*/ const OUTPUT_PATTERNS: Array<[RegExp, TokenCount]> = [ - // ------------------- - // Alibaba / Qwen - DashScope Models - // ------------------- - // Qwen3-Coder-Plus: 65,536 max output tokens - [/^qwen3-coder-plus(-.*)?$/, LIMITS['64k']], + // Google Gemini + [/^gemini-3/, LIMITS['64k']], // Gemini 3.x: 64K + [/^gemini-/, LIMITS['8k']], // Gemini fallback: 8K - // Qwen3.5-Plus: 65,536 max output tokens - [/^qwen3\.5-plus(-.*)?$/, LIMITS['64k']], + // OpenAI + [/^gpt-5/, LIMITS['128k']], // GPT-5.x: 128K + [/^gpt-/, LIMITS['16k']], // GPT fallback: 16K + [/^o\d/, LIMITS['128k']], // o-series: 128K - // Generic coder-model: same as qwen3.5-plus (64K max output tokens) - [/^coder-model$/, LIMITS['64k']], + // Anthropic Claude + [/^claude-opus-4-6/, LIMITS['128k']], // Opus 4.6: 128K + [/^claude-sonnet-4-6/, LIMITS['64k']], // Sonnet 4.6: 64K + [/^claude-/, LIMITS['64k']], // Claude fallback: 64K - // Qwen3-Max: 65,536 max output tokens - [/^qwen3-max(-preview)?(-.*)?$/, LIMITS['64k']], + // Alibaba / Qwen + [/^qwen3\.5/, LIMITS['64k']], + [/^coder-model$/, LIMITS['64k']], + [/^qwen3-max/, LIMITS['64k']], - // Qwen-VL-Max-Latest: 8,192 max output tokens - [/^qwen-vl-max-latest$/, LIMITS['8k']], + // DeepSeek + [/^deepseek-reasoner/, LIMITS['64k']], + [/^deepseek-chat/, LIMITS['8k']], - // Qwen3-VL-Plus: 32K max output tokens - [/^qwen3-vl-plus$/, LIMITS['32k']], + // Zhipu GLM + [/^glm-5/, LIMITS['16k']], + [/^glm-4\.7/, LIMITS['16k']], - // Deepseek-chat: 8k max tokens - [/^deepseek-chat$/, LIMITS['8k']], + // MiniMax + [/^minimax-m2\.5/i, LIMITS['64k']], - // Deepseek-reasoner: 64k max tokens - [/^deepseek-reasoner$/, LIMITS['64k']], + // Kimi + [/^kimi-k2\.5/, LIMITS['32k']], ]; /** diff --git a/packages/core/src/models/constants.ts b/packages/core/src/models/constants.ts index 025e3b9cfd..4551a2f43d 100644 --- a/packages/core/src/models/constants.ts +++ b/packages/core/src/models/constants.ts @@ -28,6 +28,7 @@ export const MODEL_GENERATION_CONFIG_FIELDS = [ 
'contextWindowSize', 'customHeaders', 'extra_body', + 'modalities', ] as const satisfies ReadonlyArray; /** diff --git a/packages/core/src/models/modelRegistry.ts b/packages/core/src/models/modelRegistry.ts index 7b9bdad773..c2815fb329 100644 --- a/packages/core/src/models/modelRegistry.ts +++ b/packages/core/src/models/modelRegistry.ts @@ -5,6 +5,8 @@ */ import { AuthType } from '../core/contentGenerator.js'; +import { defaultModalities } from '../core/modalityDefaults.js'; +import { tokenLimit } from '../core/tokenLimits.js'; import { DEFAULT_OPENAI_BASE_URL } from '../core/openaiContentGenerator/constants.js'; import { type ModelConfig, @@ -121,7 +123,12 @@ export class ModelRegistry { capabilities: model.capabilities, authType: model.authType, isVision: model.capabilities?.vision ?? false, - contextWindowSize: model.generationConfig.contextWindowSize, + contextWindowSize: + model.generationConfig.contextWindowSize ?? tokenLimit(model.id), + modalities: + model.generationConfig.modalities ?? 
defaultModalities(model.id), + baseUrl: model.baseUrl, + envKey: model.envKey, })); } diff --git a/packages/core/src/models/modelsConfig.ts b/packages/core/src/models/modelsConfig.ts index a77d1d06b1..d22cc790cb 100644 --- a/packages/core/src/models/modelsConfig.ts +++ b/packages/core/src/models/modelsConfig.ts @@ -11,6 +11,7 @@ import type { ContentGeneratorConfig } from '../core/contentGenerator.js'; import type { ContentGeneratorConfigSources } from '../core/contentGenerator.js'; import { DEFAULT_QWEN_MODEL } from '../config/models.js'; import { tokenLimit } from '../core/tokenLimits.js'; +import { defaultModalities } from '../core/modalityDefaults.js'; import { ModelRegistry } from './modelRegistry.js'; import { @@ -770,6 +771,15 @@ export class ModelsConfig { detail: 'auto-detected from model', }; } + + // modalities fallback: auto-detect from model when not set by provider + if (gc.modalities === undefined) { + this._generationConfig.modalities = defaultModalities(model.id); + this.generationConfigSources['modalities'] = { + kind: 'computed', + detail: 'auto-detected from model', + }; + } } /** diff --git a/packages/core/src/models/types.ts b/packages/core/src/models/types.ts index 69c286729f..5c9c9b51d0 100644 --- a/packages/core/src/models/types.ts +++ b/packages/core/src/models/types.ts @@ -7,6 +7,7 @@ import type { AuthType, ContentGeneratorConfig, + InputModalities, } from '../core/contentGenerator.js'; import type { ConfigSources } from '../utils/configResolver.js'; @@ -35,6 +36,7 @@ export type ModelGenerationConfig = Pick< | 'customHeaders' | 'extra_body' | 'contextWindowSize' + | 'modalities' >; /** @@ -93,6 +95,9 @@ export interface AvailableModel { authType: AuthType; isVision?: boolean; contextWindowSize?: number; + modalities?: InputModalities; + baseUrl?: string; + envKey?: string; /** Whether this is a runtime model (not from modelProviders) */ isRuntimeModel?: boolean; diff --git a/packages/core/src/tools/read-file.test.ts 
b/packages/core/src/tools/read-file.test.ts index 4972f26e73..ec07a69955 100644 --- a/packages/core/src/tools/read-file.test.ts +++ b/packages/core/src/tools/read-file.test.ts @@ -231,8 +231,8 @@ describe('ReadFileTool', () => { it('should return error for a file that is too large', async () => { const filePath = path.join(tempRootDir, 'largefile.txt'); - // 21MB of content exceeds 20MB limit - const largeContent = 'x'.repeat(21 * 1024 * 1024); + // 11MB of content exceeds 10MB limit + const largeContent = 'x'.repeat(11 * 1024 * 1024); await fsp.writeFile(filePath, largeContent, 'utf-8'); const params: ReadFileToolParams = { absolute_path: filePath }; const invocation = tool.build(params) as ToolInvocation< @@ -244,7 +244,7 @@ describe('ReadFileTool', () => { expect(result).toHaveProperty('error'); expect(result.error?.type).toBe(ToolErrorType.FILE_TOO_LARGE); expect(result.error?.message).toContain( - 'File size exceeds the 20MB limit', + 'File size exceeds the 10MB limit', ); }); diff --git a/packages/core/src/utils/fileUtils.test.ts b/packages/core/src/utils/fileUtils.test.ts index da9f257fdf..b21ee79e25 100644 --- a/packages/core/src/utils/fileUtils.test.ts +++ b/packages/core/src/utils/fileUtils.test.ts @@ -948,13 +948,13 @@ describe('fileUtils', () => { ); }); - it('should return an error if the file size exceeds 20MB', async () => { + it('should return an error if the file size exceeds 10MB', async () => { // Create a small test file actualNodeFs.writeFileSync(testTextFilePath, 'test content'); // Spy on fs.promises.stat to return a large file size const statSpy = vi.spyOn(fs.promises, 'stat').mockResolvedValueOnce({ - size: 21 * 1024 * 1024, + size: 11 * 1024 * 1024, isDirectory: () => false, } as fs.Stats); @@ -964,11 +964,11 @@ describe('fileUtils', () => { mockConfig, ); - expect(result.error).toContain('File size exceeds the 20MB limit'); + expect(result.error).toContain('File size exceeds the 10MB limit'); expect(result.returnDisplay).toContain( - 
'File size exceeds the 20MB limit', + 'File size exceeds the 10MB limit', ); - expect(result.llmContent).toContain('File size exceeds the 20MB limit'); + expect(result.llmContent).toContain('File size exceeds the 10MB limit'); } finally { statSpy.mockRestore(); } diff --git a/packages/core/src/utils/fileUtils.ts b/packages/core/src/utils/fileUtils.ts index 3e4124d185..aab6935cbb 100644 --- a/packages/core/src/utils/fileUtils.ts +++ b/packages/core/src/utils/fileUtils.ts @@ -340,11 +340,12 @@ export async function processSingleFileContent( } const fileSizeInMB = stats.size / (1024 * 1024); - if (fileSizeInMB > 20) { + // Use 9.9MB instead of 10MB to leave margin for encoding overhead (#1880) + if (fileSizeInMB > 9.9) { return { - llmContent: 'File size exceeds the 20MB limit.', - returnDisplay: 'File size exceeds the 20MB limit.', - error: `File size exceeds the 20MB limit: ${filePath} (${fileSizeInMB.toFixed(2)}MB)`, + llmContent: 'File size exceeds the 10MB limit.', + returnDisplay: 'File size exceeds the 10MB limit.', + error: `File size exceeds the 10MB limit: ${filePath} (${fileSizeInMB.toFixed(2)}MB)`, errorType: ToolErrorType.FILE_TOO_LARGE, }; } @@ -465,6 +466,16 @@ export async function processSingleFileContent( case 'pdf': { const contentBuffer = await fs.promises.readFile(filePath); const base64Data = contentBuffer.toString('base64'); + const base64SizeInMB = base64Data.length / (1024 * 1024); + // Use 9.9MB instead of 10MB to leave margin for small overhead (#1880) + if (base64SizeInMB > 9.9) { + return { + llmContent: `File exceeds the 10MB data URI limit after base64 encoding (${base64SizeInMB.toFixed(2)}MB encoded).`, + returnDisplay: `File exceeds the 10MB data URI limit after base64 encoding.`, + error: `File exceeds the 10MB data URI limit after base64 encoding: ${filePath} (${base64SizeInMB.toFixed(2)}MB encoded)`, + errorType: ToolErrorType.FILE_TOO_LARGE, + }; + } return { llmContent: { inlineData: { diff --git 
a/packages/core/src/utils/pathReader.test.ts b/packages/core/src/utils/pathReader.test.ts index 5de10765b2..282a7d6d1e 100644 --- a/packages/core/src/utils/pathReader.test.ts +++ b/packages/core/src/utils/pathReader.test.ts @@ -392,8 +392,8 @@ describe('readPathFromWorkspace', () => { ); it('should return an error string for files exceeding the size limit', async () => { - // Mock a file slightly larger than the 20MB limit defined in fileUtils.ts - const largeContent = 'a'.repeat(21 * 1024 * 1024); // 21MB + // Mock a file slightly larger than the 10MB limit defined in fileUtils.ts + const largeContent = 'a'.repeat(11 * 1024 * 1024); // 11MB mock({ [CWD]: { 'large.txt': largeContent, @@ -406,6 +406,6 @@ describe('readPathFromWorkspace', () => { const result = await readPathFromWorkspace('large.txt', config); const textResult = result[0] as string; // The error message comes directly from processSingleFileContent - expect(textResult).toBe('File size exceeds the 20MB limit.'); + expect(textResult).toBe('File size exceeds the 10MB limit.'); }); });