diff --git a/.github/workflows/build-cuda.yml b/.github/workflows/build-cuda.yml new file mode 100644 index 00000000..7076b3f4 --- /dev/null +++ b/.github/workflows/build-cuda.yml @@ -0,0 +1,73 @@ +name: Build CUDA Backend + +on: + workflow_dispatch: + push: + tags: + - "v*" + +jobs: + build-cuda-windows: + runs-on: windows-latest + permissions: + contents: write + + steps: + - uses: actions/checkout@v4 + + - name: Setup Python + uses: actions/setup-python@v5 + with: + python-version: "3.12" + cache: "pip" + + - name: Install Python dependencies + run: | + python -m pip install --upgrade pip + pip install pyinstaller + pip install -r backend/requirements.txt + + - name: Install PyTorch with CUDA 12.1 + run: | + pip install torch --index-url https://download.pytorch.org/whl/cu121 --force-reinstall --no-deps + pip install torchaudio --index-url https://download.pytorch.org/whl/cu121 + + - name: Verify CUDA support in torch + run: | + python -c "import torch; print(f'CUDA available in build: {torch.cuda.is_available()}'); print(f'CUDA version: {torch.version.cuda}')" + + - name: Build CUDA server binary + shell: bash + working-directory: backend + run: python build_binary.py --cuda + + - name: Split binary for GitHub Releases + shell: bash + run: | + python scripts/split_binary.py \ + backend/dist/voicebox-server-cuda.exe \ + --output release-assets/ + + - name: Upload split parts to GitHub Release + if: startsWith(github.ref, 'refs/tags/') + uses: softprops/action-gh-release@v1 + with: + files: | + release-assets/voicebox-server-cuda.part*.exe + release-assets/voicebox-server-cuda.sha256 + release-assets/voicebox-server-cuda.manifest + draft: true + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + + - name: Upload binary as workflow artifact (for testing) + uses: actions/upload-artifact@v4 + with: + name: voicebox-server-cuda-windows + path: backend/dist/voicebox-server-cuda.exe + retention-days: 7 + + # Linux CUDA build can be added later with: + # build-cuda-linux: + # runs-on: ubuntu-22.04 + # ... 
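Note: the "Split binary for GitHub Releases" step above calls `scripts/split_binary.py`, which is not part of this diff hunk. The following is a minimal sketch, not the actual script — it assumes the part, `.manifest`, and `.sha256` file names match what the workflow uploads (`voicebox-server-cuda.partNN.exe` etc.) and what `backend/cuda_download.py` later reassembles; the CLI flags shown (`--output`, `--chunk-size`) are illustrative.

```python
"""Sketch of scripts/split_binary.py (assumed interface, not the shipped script)."""
import argparse
import hashlib
from pathlib import Path

# Stay comfortably under GitHub's ~2 GB release-asset limit.
DEFAULT_CHUNK = 1900 * 1024 * 1024


def split_binary(input_path: Path, output_dir: Path, chunk_size: int = DEFAULT_CHUNK) -> None:
    output_dir.mkdir(parents=True, exist_ok=True)
    stem, suffix = input_path.stem, input_path.suffix

    sha256 = hashlib.sha256()
    parts: list[str] = []

    # Read and write one chunk at a time so the ~2.4 GB binary is never
    # held fully in memory; hash the stream as we go.
    with open(input_path, "rb") as src:
        index = 0
        while True:
            chunk = src.read(chunk_size)
            if not chunk:
                break
            sha256.update(chunk)
            part_name = f"{stem}.part{index:02d}{suffix}"  # e.g. voicebox-server-cuda.part00.exe
            (output_dir / part_name).write_bytes(chunk)
            parts.append(part_name)
            index += 1

    # Manifest: one part filename per line, in concatenation order
    # (backend/cuda_download.py splits on lines and strips whitespace).
    (output_dir / f"{stem}.manifest").write_text("\n".join(parts) + "\n")
    # Checksum of the complete, unsplit binary: "sha256hex  filename"
    (output_dir / f"{stem}.sha256").write_text(f"{sha256.hexdigest()}  {input_path.name}\n")
    print(f"Split {input_path.name} into {len(parts)} parts")


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Split a binary into <2 GB parts for GitHub Releases")
    parser.add_argument("input", type=Path, help="Path to the binary to split")
    parser.add_argument("--output", type=Path, required=True, help="Directory for parts, manifest, and checksum")
    parser.add_argument("--chunk-size", type=int, default=DEFAULT_CHUNK, help="Chunk size in bytes")
    args = parser.parse_args()
    split_binary(args.input, args.output, args.chunk_size)
```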
diff --git a/app/src/components/ServerSettings/GpuAcceleration.tsx b/app/src/components/ServerSettings/GpuAcceleration.tsx new file mode 100644 index 00000000..17da7690 --- /dev/null +++ b/app/src/components/ServerSettings/GpuAcceleration.tsx @@ -0,0 +1,387 @@ +import { useQuery, useQueryClient } from '@tanstack/react-query'; +import { AlertCircle, Cpu, Download, Loader2, RotateCw, Trash2, Zap } from 'lucide-react'; +import { useCallback, useEffect, useRef, useState } from 'react'; +import { Badge } from '@/components/ui/badge'; +import { Button } from '@/components/ui/button'; +import { Card, CardContent, CardHeader, CardTitle } from '@/components/ui/card'; +import { Progress } from '@/components/ui/progress'; +import { apiClient } from '@/lib/api/client'; +import type { CudaDownloadProgress } from '@/lib/api/types'; +import { useServerHealth } from '@/lib/hooks/useServer'; +import { usePlatform } from '@/platform/PlatformContext'; +import { useServerStore } from '@/stores/serverStore'; + +type RestartPhase = 'idle' | 'stopping' | 'waiting' | 'ready'; + +export function GpuAcceleration() { + const platform = usePlatform(); + const queryClient = useQueryClient(); + const serverUrl = useServerStore((state) => state.serverUrl); + const { data: health } = useServerHealth(); + + const [restartPhase, setRestartPhase] = useState<RestartPhase>('idle'); + const [error, setError] = useState<string | null>(null); + const [downloadProgress, setDownloadProgress] = useState<CudaDownloadProgress | null>(null); + const healthPollRef = useRef<ReturnType<typeof setInterval> | null>(null); + + // Query CUDA backend status + const { + data: cudaStatus, + isLoading: cudaStatusLoading, + refetch: refetchCudaStatus, + } = useQuery({ + queryKey: ['cuda-status', serverUrl], + queryFn: () => apiClient.getCudaStatus(), + refetchInterval: cudaStatusLoading ? false : 10000, + retry: 1, + enabled: !!health, // Only fetch when backend is reachable + }); + + // Derived state + const isCurrentlyCuda = health?.backend_variant === 'cuda'; + const cudaAvailable = cudaStatus?.available ?? false; + const cudaDownloading = cudaStatus?.downloading ?? 
false; + + // Clean up health poll on unmount + useEffect(() => { + return () => { + if (healthPollRef.current) { + clearInterval(healthPollRef.current); + healthPollRef.current = null; + } + }; + }, []); + + // SSE progress tracking during download + useEffect(() => { + if (!cudaDownloading || !serverUrl) { + return; + } + + const eventSource = new EventSource(`${serverUrl}/backend/cuda-progress`); + + eventSource.onmessage = (event) => { + try { + const data = JSON.parse(event.data) as CudaDownloadProgress; + setDownloadProgress(data); + + if (data.status === 'complete') { + eventSource.close(); + setDownloadProgress(null); + refetchCudaStatus(); + } else if (data.status === 'error') { + eventSource.close(); + setError(data.error || 'Download failed'); + setDownloadProgress(null); + refetchCudaStatus(); + } + } catch (e) { + console.error('Error parsing CUDA progress event:', e); + } + }; + + eventSource.onerror = () => { + eventSource.close(); + }; + + return () => { + eventSource.close(); + }; + }, [cudaDownloading, serverUrl, refetchCudaStatus]); + + // Start aggressive health polling during restart + const startHealthPolling = useCallback(() => { + if (healthPollRef.current) return; + + healthPollRef.current = setInterval(async () => { + try { + const result = await apiClient.getHealth(); + if (result.status === 'healthy') { + // Server is back up + if (healthPollRef.current) { + clearInterval(healthPollRef.current); + healthPollRef.current = null; + } + setRestartPhase('ready'); + // Invalidate all queries to refresh UI + queryClient.invalidateQueries(); + // Reset after a moment + setTimeout(() => setRestartPhase('idle'), 2000); + } + } catch { + // Server still down, keep polling + } + }, 1000); + }, [queryClient]); + + const handleDownload = async () => { + setError(null); + try { + await apiClient.downloadCudaBackend(); + refetchCudaStatus(); + } catch (e: unknown) { + const msg = e instanceof Error ? e.message : 'Failed to start download'; + if (msg.includes('already downloaded')) { + refetchCudaStatus(); + } else { + setError(msg); + } + } + }; + + const handleRestart = async () => { + setError(null); + setRestartPhase('stopping'); + + try { + setRestartPhase('waiting'); + startHealthPolling(); + await platform.lifecycle.restartServer(); + // Invoke resolved — server is likely ready. Stop polling and refresh. + if (healthPollRef.current) { + clearInterval(healthPollRef.current); + healthPollRef.current = null; + } + setRestartPhase('ready'); + queryClient.invalidateQueries(); + setTimeout(() => setRestartPhase('idle'), 2000); + } catch (e: unknown) { + setRestartPhase('idle'); + if (healthPollRef.current) { + clearInterval(healthPollRef.current); + healthPollRef.current = null; + } + setError(e instanceof Error ? e.message : 'Restart failed'); + } + }; + + const handleSwitchToCpu = async () => { + // To switch to CPU: delete the CUDA binary, then restart. + // start_server always prefers CUDA if present, so we must remove it first. 
+ setError(null); + setRestartPhase('stopping'); + + try { + await apiClient.deleteCudaBackend(); + setRestartPhase('waiting'); + startHealthPolling(); + await platform.lifecycle.restartServer(); + // Invoke resolved — server is likely ready + if (healthPollRef.current) { + clearInterval(healthPollRef.current); + healthPollRef.current = null; + } + setRestartPhase('ready'); + queryClient.invalidateQueries(); + setTimeout(() => setRestartPhase('idle'), 2000); + } catch (e: unknown) { + setRestartPhase('idle'); + if (healthPollRef.current) { + clearInterval(healthPollRef.current); + healthPollRef.current = null; + } + setError(e instanceof Error ? e.message : 'Failed to switch to CPU'); + refetchCudaStatus(); + } + }; + + const handleDelete = async () => { + setError(null); + try { + await apiClient.deleteCudaBackend(); + refetchCudaStatus(); + } catch (e: unknown) { + setError(e instanceof Error ? e.message : 'Failed to delete CUDA backend'); + } + }; + + const formatBytes = (bytes: number): string => { + if (bytes === 0) return '0 B'; + const k = 1024; + const sizes = ['B', 'KB', 'MB', 'GB']; + const i = Math.floor(Math.log(bytes) / Math.log(k)); + return `${(bytes / k ** i).toFixed(1)} ${sizes[i]}`; + }; + + // Don't render until health data is available + if (!health) return null; + + // If the system already has native GPU (MPS, etc.), only show info - no CUDA needed + const hasNativeGpu = + health.gpu_available && + !isCurrentlyCuda && + health.gpu_type && + !health.gpu_type.includes('CUDA'); + + return ( + + + + + GPU Acceleration + + + + {/* Current status */} +
+
+
Backend
+
+ {isCurrentlyCuda ? 'CUDA (GPU accelerated)' : 'CPU'} +
+
+ + {isCurrentlyCuda ? ( + <> + CUDA + + ) : ( + <> + CPU + + )} + +
+ + {/* GPU info from health */} + {health.gpu_type && ( +
+
GPU
+
{health.gpu_type}
+ {health.vram_used_mb != null && ( +
+ VRAM: {health.vram_used_mb.toFixed(0)} MB used +
+ )} +
+ )} + + {/* Native GPU detected - no CUDA download needed */} + {hasNativeGpu && ( +
+
+ Your system uses {health.gpu_type} for acceleration. No additional + downloads needed. +
+
+ )} + + {/* CUDA download section - only show when native GPU is NOT detected (i.e., Windows/Linux NVIDIA users) */} + {!hasNativeGpu && ( + <> + {/* Download progress */} + {cudaDownloading && downloadProgress && ( +
+
+
+ + {downloadProgress.filename || 'Downloading CUDA backend...'} +
+ {downloadProgress.total > 0 && ( + + {downloadProgress.progress.toFixed(1)}% + + )} +
+ {downloadProgress.total > 0 && ( + <> + +
+ {formatBytes(downloadProgress.current)} /{' '} + {formatBytes(downloadProgress.total)} +
+ + )} +
+ )} + + {/* Restart in progress */} + {restartPhase !== 'idle' && ( +
+ + + {restartPhase === 'stopping' && 'Stopping server...'} + {restartPhase === 'waiting' && 'Restarting server...'} + {restartPhase === 'ready' && 'Server restarted successfully!'} + +
+ )} + + {/* Error display */} + {error && ( +
+ + {error} +
+ )} + + {/* Actions */} + {restartPhase === 'idle' && !cudaDownloading && ( +
+ {/* Not downloaded yet - show download button */} + {!cudaAvailable && ( +
+

+ Download the CUDA backend (~2.4 GB) for NVIDIA GPU acceleration. Requires an + NVIDIA GPU with CUDA support. +

+ +
+ )} + + {/* Downloaded but not active - show switch button */} + {cudaAvailable && !isCurrentlyCuda && platform.metadata.isTauri && ( +
+

+ CUDA backend is downloaded and ready. Restart the server to enable GPU + acceleration. +

+ +
+ )} + + {/* Currently active - show switch back to CPU */} + {isCurrentlyCuda && platform.metadata.isTauri && ( +
+

+ Running with CUDA GPU acceleration. Switch back to CPU if needed (you can + re-download later). +

+ +
+ )} + + {/* Delete option when downloaded (and not active) */} + {cudaAvailable && !isCurrentlyCuda && ( + + )} +
+ )} + + )} +
+
+ ); +} diff --git a/app/src/components/ServerTab/ServerTab.tsx b/app/src/components/ServerTab/ServerTab.tsx index abf91ac2..1f32ac04 100644 --- a/app/src/components/ServerTab/ServerTab.tsx +++ b/app/src/components/ServerTab/ServerTab.tsx @@ -1,4 +1,5 @@ import { ConnectionForm } from '@/components/ServerSettings/ConnectionForm'; +import { GpuAcceleration } from '@/components/ServerSettings/GpuAcceleration'; import { ServerStatus } from '@/components/ServerSettings/ServerStatus'; import { UpdateStatus } from '@/components/ServerSettings/UpdateStatus'; import { usePlatform } from '@/platform/PlatformContext'; @@ -11,6 +12,7 @@ export function ServerTab() { + {platform.metadata.isTauri && } {platform.metadata.isTauri && }
Created by{' '} diff --git a/app/src/lib/api/client.ts b/app/src/lib/api/client.ts index ce54d2bc..eb78e440 100644 --- a/app/src/lib/api/client.ts +++ b/app/src/lib/api/client.ts @@ -1,29 +1,30 @@ -import { useServerStore } from '@/stores/serverStore'; import type { LanguageCode } from '@/lib/constants/languages'; +import { useServerStore } from '@/stores/serverStore'; import type { - VoiceProfileCreate, - VoiceProfileResponse, - ProfileSampleResponse, + ActiveTasksResponse, + CudaStatus, GenerationRequest, GenerationResponse, - HistoryQuery, + HealthResponse, HistoryListResponse, + HistoryQuery, HistoryResponse, - TranscriptionResponse, - HealthResponse, - ModelStatusListResponse, ModelDownloadRequest, - ActiveTasksResponse, + ModelStatusListResponse, + ProfileSampleResponse, StoryCreate, - StoryResponse, StoryDetailResponse, + StoryItemBatchUpdate, StoryItemCreate, StoryItemDetail, - StoryItemBatchUpdate, - StoryItemReorder, StoryItemMove, - StoryItemTrim, + StoryItemReorder, StoryItemSplit, + StoryItemTrim, + StoryResponse, + TranscriptionResponse, + VoiceProfileCreate, + VoiceProfileResponse, } from './types'; class ApiClient { @@ -251,7 +252,13 @@ class ApiClient { return response.blob(); } - async importGeneration(file: File): Promise<{ id: string; profile_id: string; profile_name: string; text: string; message: string }> { + async importGeneration(file: File): Promise<{ + id: string; + profile_id: string; + profile_name: string; + text: string; + message: string; + }> { const url = `${this.getBaseUrl()}/history/import`; const formData = new FormData(); formData.append('file', file); @@ -310,7 +317,12 @@ class ApiClient { } async triggerModelDownload(modelName: string): Promise<{ message: string }> { - console.log('[API] triggerModelDownload called for:', modelName, 'at', new Date().toISOString()); + console.log( + '[API] triggerModelDownload called for:', + modelName, + 'at', + new Date().toISOString(), + ); const result = await this.request<{ message: string }>('/models/download', { method: 'POST', body: JSON.stringify({ model_name: modelName } as ModelDownloadRequest), @@ -354,10 +366,7 @@ class ApiClient { return this.request('/channels'); } - async createChannel(data: { - name: string; - device_ids: string[]; - }): Promise<{ + async createChannel(data: { name: string; device_ids: string[] }): Promise<{ id: string; name: string; is_default: boolean; @@ -399,10 +408,7 @@ class ApiClient { return this.request(`/channels/${channelId}/voices`); } - async setChannelVoices( - channelId: string, - profileIds: string[], - ): Promise<{ message: string }> { + async setChannelVoices(channelId: string, profileIds: string[]): Promise<{ message: string }> { return this.request(`/channels/${channelId}/voices`, { method: 'PUT', body: JSON.stringify({ profile_ids: profileIds }), @@ -413,16 +419,30 @@ class ApiClient { return this.request(`/profiles/${profileId}/channels`); } - async setProfileChannels( - profileId: string, - channelIds: string[], - ): Promise<{ message: string }> { + async setProfileChannels(profileId: string, channelIds: string[]): Promise<{ message: string }> { return this.request(`/profiles/${profileId}/channels`, { method: 'PUT', body: JSON.stringify({ channel_ids: channelIds }), }); } + // CUDA Backend Management + async getCudaStatus(): Promise { + return this.request('/backend/cuda-status'); + } + + async downloadCudaBackend(): Promise<{ message: string; progress_key: string }> { + return this.request<{ message: string; progress_key: string }>('/backend/download-cuda', { + 
method: 'POST', + }); + } + + async deleteCudaBackend(): Promise<{ message: string }> { + return this.request<{ message: string }>('/backend/cuda', { + method: 'DELETE', + }); + } + // Stories async listStories(): Promise { return this.request('/stories'); @@ -479,21 +499,33 @@ class ApiClient { }); } - async moveStoryItem(storyId: string, itemId: string, data: StoryItemMove): Promise { + async moveStoryItem( + storyId: string, + itemId: string, + data: StoryItemMove, + ): Promise { return this.request(`/stories/${storyId}/items/${itemId}/move`, { method: 'PUT', body: JSON.stringify(data), }); } - async trimStoryItem(storyId: string, itemId: string, data: StoryItemTrim): Promise { + async trimStoryItem( + storyId: string, + itemId: string, + data: StoryItemTrim, + ): Promise { return this.request(`/stories/${storyId}/items/${itemId}/trim`, { method: 'PUT', body: JSON.stringify(data), }); } - async splitStoryItem(storyId: string, itemId: string, data: StoryItemSplit): Promise { + async splitStoryItem( + storyId: string, + itemId: string, + data: StoryItemSplit, + ): Promise { return this.request(`/stories/${storyId}/items/${itemId}/split`, { method: 'POST', body: JSON.stringify(data), diff --git a/app/src/lib/api/types.ts b/app/src/lib/api/types.ts index e4cfe1fb..0baeb52a 100644 --- a/app/src/lib/api/types.ts +++ b/app/src/lib/api/types.ts @@ -78,7 +78,29 @@ export interface HealthResponse { model_downloaded?: boolean; model_size?: string; gpu_available: boolean; + gpu_type?: string; vram_used_mb?: number; + backend_type?: string; + backend_variant?: string; // "cpu" or "cuda" +} + +export interface CudaDownloadProgress { + model_name: string; + current: number; + total: number; + progress: number; + filename?: string; + status: 'downloading' | 'extracting' | 'complete' | 'error'; + timestamp: string; + error?: string; +} + +export interface CudaStatus { + available: boolean; // CUDA binary exists on disk + active: boolean; // Currently running the CUDA binary + binary_path?: string; + downloading: boolean; // Download in progress + download_progress?: CudaDownloadProgress; } export interface ModelProgress { @@ -96,7 +118,7 @@ export interface ModelStatus { model_name: string; display_name: string; downloaded: boolean; - downloading: boolean; // True if download is in progress + downloading: boolean; // True if download is in progress size_mb?: number; loaded: boolean; } diff --git a/app/src/platform/types.ts b/app/src/platform/types.ts index 5ea4d609..23e99da5 100644 --- a/app/src/platform/types.ts +++ b/app/src/platform/types.ts @@ -51,6 +51,7 @@ export interface PlatformAudio { export interface PlatformLifecycle { startServer(remote?: boolean): Promise; stopServer(): Promise; + restartServer(): Promise; setKeepServerRunning(keep: boolean): Promise; setupWindowCloseHandler(): Promise; onServerReady?: () => void; diff --git a/backend/build_binary.py b/backend/build_binary.py index 73f21d23..1934db3c 100644 --- a/backend/build_binary.py +++ b/backend/build_binary.py @@ -1,8 +1,13 @@ """ PyInstaller build script for creating standalone Python server binary. 
+ +Usage: + python build_binary.py # Build default (CPU) server binary + python build_binary.py --cuda # Build CUDA-enabled server binary """ import PyInstaller.__main__ +import argparse import os import platform from pathlib import Path @@ -13,15 +18,22 @@ def is_apple_silicon(): return platform.system() == "Darwin" and platform.machine() == "arm64" -def build_server(): - """Build Python server as standalone binary.""" +def build_server(cuda=False): + """Build Python server as standalone binary. + + Args: + cuda: If True, build with CUDA support and name the binary + voicebox-server-cuda instead of voicebox-server. + """ backend_dir = Path(__file__).parent + binary_name = 'voicebox-server-cuda' if cuda else 'voicebox-server' + # PyInstaller arguments args = [ 'server.py', # Use server.py as entry point instead of main.py '--onefile', - '--name', 'voicebox-server', + '--name', binary_name, ] # Add local qwen_tts path if specified (for editable installs) @@ -49,6 +61,7 @@ def build_server(): '--hidden-import', 'backend.utils.progress', '--hidden-import', 'backend.utils.hf_progress', '--hidden-import', 'backend.utils.validation', + '--hidden-import', 'backend.cuda_download', '--hidden-import', 'torch', '--hidden-import', 'transformers', '--hidden-import', 'fastapi', @@ -70,8 +83,16 @@ def build_server(): '--collect-submodules', 'jaraco', ]) - # Add MLX-specific imports if building on Apple Silicon - if is_apple_silicon(): + # Add CUDA-specific hidden imports + if cuda: + print("Building with CUDA support") + args.extend([ + '--hidden-import', 'torch.cuda', + '--hidden-import', 'torch.backends.cudnn', + ]) + + # Add MLX-specific imports if building on Apple Silicon (never for CUDA builds) + if is_apple_silicon() and not cuda: print("Building for Apple Silicon - including MLX dependencies") args.extend([ '--hidden-import', 'backend.backends.mlx_backend', @@ -91,7 +112,7 @@ def build_server(): '--collect-all', 'mlx', '--collect-all', 'mlx_audio', ]) - else: + elif not cuda: print("Building for non-Apple Silicon platform - PyTorch only") args.extend([ @@ -105,8 +126,15 @@ def build_server(): # Run PyInstaller PyInstaller.__main__.run(args) - print(f"Binary built in {backend_dir / 'dist' / 'voicebox-server'}") + print(f"Binary built in {backend_dir / 'dist' / binary_name}") if __name__ == '__main__': - build_server() + parser = argparse.ArgumentParser(description="Build voicebox-server binary") + parser.add_argument( + '--cuda', + action='store_true', + help="Build CUDA-enabled binary (voicebox-server-cuda)", + ) + cli_args = parser.parse_args() + build_server(cuda=cli_args.cuda) diff --git a/backend/cuda_download.py b/backend/cuda_download.py new file mode 100644 index 00000000..51a8302d --- /dev/null +++ b/backend/cuda_download.py @@ -0,0 +1,198 @@ +""" +CUDA backend binary download, assembly, and verification. + +Downloads split parts of the CUDA-enabled voicebox-server binary from +GitHub Releases, reassembles them, verifies integrity via SHA-256, +and places the binary in the app's data directory for use on next +backend restart. +""" + +import hashlib +import logging +import os +import sys +from pathlib import Path +from typing import Optional + +from .config import get_data_dir +from .utils.progress import get_progress_manager +from . 
import __version__ + +logger = logging.getLogger(__name__) + +GITHUB_RELEASES_URL = "https://github.com/jamiepine/voicebox/releases/download" + +PROGRESS_KEY = "cuda-backend" + + +def get_backends_dir() -> Path: + """Directory where downloaded backend binaries are stored.""" + d = get_data_dir() / "backends" + d.mkdir(parents=True, exist_ok=True) + return d + + +def get_cuda_binary_name() -> str: + """Platform-specific CUDA binary filename.""" + if sys.platform == "win32": + return "voicebox-server-cuda.exe" + return "voicebox-server-cuda" + + +def get_cuda_binary_path() -> Optional[Path]: + """Return path to CUDA binary if it exists.""" + p = get_backends_dir() / get_cuda_binary_name() + if p.exists(): + return p + return None + + +def is_cuda_active() -> bool: + """Check if the current process is the CUDA binary. + + The CUDA binary sets this env var on startup (see server.py). + """ + return os.environ.get("VOICEBOX_BACKEND_VARIANT") == "cuda" + + +def get_cuda_status() -> dict: + """Get current CUDA backend status for the API.""" + progress_manager = get_progress_manager() + cuda_path = get_cuda_binary_path() + progress = progress_manager.get_progress(PROGRESS_KEY) + + return { + "available": cuda_path is not None, + "active": is_cuda_active(), + "binary_path": str(cuda_path) if cuda_path else None, + "downloading": progress is not None and progress.get("status") == "downloading", + "download_progress": progress, + } + + +async def download_cuda_binary(version: Optional[str] = None): + """Download the CUDA backend binary from GitHub Releases. + + Downloads split parts listed in a manifest file, concatenates them, + and verifies the SHA-256 checksum for integrity. Atomic write + (temp file -> rename). + + Args: + version: Version tag (e.g. "v0.2.0"). Defaults to current app version. 
+ """ + import httpx + + if version is None: + version = f"v{__version__}" + + progress = get_progress_manager() + binary_name = get_cuda_binary_name() + dest_dir = get_backends_dir() + final_path = dest_dir / binary_name + temp_path = dest_dir / f"{binary_name}.download" + + # Clean up any leftover partial download + if temp_path.exists(): + temp_path.unlink() + + logger.info(f"Starting CUDA backend download for {version}") + progress.update_progress( + PROGRESS_KEY, current=0, total=0, + filename="Fetching manifest...", status="downloading", + ) + + base_url = f"{GITHUB_RELEASES_URL}/{version}" + stem = Path(binary_name).stem # voicebox-server-cuda + + try: + async with httpx.AsyncClient(follow_redirects=True, timeout=30.0) as client: + # Fetch the manifest (list of split part filenames) + manifest_url = f"{base_url}/{stem}.manifest" + manifest_resp = await client.get(manifest_url) + manifest_resp.raise_for_status() + parts = [p.strip() for p in manifest_resp.text.strip().splitlines() if p.strip()] + + if not parts: + raise ValueError("Empty manifest — no split parts found") + + logger.info(f"Found {len(parts)} split parts to download") + + # Fetch expected checksum (optional — for integrity verification) + expected_sha = None + try: + sha_url = f"{base_url}/{stem}.sha256" + sha_resp = await client.get(sha_url) + if sha_resp.status_code == 200: + # Format: "sha256hex filename\n" + expected_sha = sha_resp.text.strip().split()[0] + logger.info(f"Expected SHA-256: {expected_sha[:16]}...") + except Exception as e: + logger.warning(f"Could not fetch checksum file — skipping verification: {e}") + + # Download and concatenate parts + total_downloaded = 0 + with open(temp_path, "wb") as f: + for i, part_name in enumerate(parts): + part_url = f"{base_url}/{part_name}" + logger.info(f"Downloading part {i + 1}/{len(parts)}: {part_name}") + + async with client.stream("GET", part_url) as response: + response.raise_for_status() + async for chunk in response.aiter_bytes(chunk_size=1024 * 1024): + f.write(chunk) + total_downloaded += len(chunk) + progress.update_progress( + PROGRESS_KEY, current=total_downloaded, total=0, + filename=f"Part {i + 1}/{len(parts)}", + status="downloading", + ) + + # Verify integrity if checksum was available + if expected_sha: + progress.update_progress( + PROGRESS_KEY, current=total_downloaded, total=total_downloaded, + filename="Verifying integrity...", status="downloading", + ) + sha256 = hashlib.sha256() + with open(temp_path, "rb") as f: + while True: + chunk = f.read(1024 * 1024) + if not chunk: + break + sha256.update(chunk) + + actual = sha256.hexdigest() + if actual != expected_sha: + raise ValueError( + f"Integrity check failed: expected {expected_sha[:16]}..., " + f"got {actual[:16]}..." + ) + logger.info(f"Integrity verified: {actual[:16]}...") + + # Atomic move into place (replace handles existing target on all platforms) + temp_path.replace(final_path) + + # Make executable on Unix + if sys.platform != "win32": + final_path.chmod(0o755) + + logger.info(f"CUDA backend downloaded to {final_path}") + progress.mark_complete(PROGRESS_KEY) + + except Exception as e: + # Clean up on failure + if temp_path.exists(): + temp_path.unlink() + logger.error(f"CUDA backend download failed: {e}") + progress.mark_error(PROGRESS_KEY, str(e)) + raise + + +async def delete_cuda_binary() -> bool: + """Delete the downloaded CUDA binary. 
Returns True if deleted.""" + path = get_cuda_binary_path() + if path and path.exists(): + path.unlink() + logger.info(f"Deleted CUDA binary: {path}") + return True + return False diff --git a/backend/main.py b/backend/main.py index c9d7e7f7..0240fb97 100644 --- a/backend/main.py +++ b/backend/main.py @@ -206,6 +206,7 @@ async def health(): gpu_type=gpu_type, vram_used_mb=vram_used, backend_type=backend_type, + backend_variant=os.environ.get("VOICEBOX_BACKEND_VARIANT", "cpu"), ) @@ -1797,6 +1798,75 @@ async def get_active_tasks(): ) +# ============================================ +# CUDA BACKEND MANAGEMENT +# ============================================ + +@app.get("/backend/cuda-status") +async def get_cuda_status(): + """Get CUDA backend download/availability status.""" + from . import cuda_download + return cuda_download.get_cuda_status() + + +@app.post("/backend/download-cuda") +async def download_cuda_backend(): + """Download the CUDA backend binary. Returns immediately; track progress via SSE.""" + from . import cuda_download + + # Check if already downloaded + if cuda_download.get_cuda_binary_path() is not None: + raise HTTPException(status_code=409, detail="CUDA backend already downloaded") + + async def _download(): + try: + await cuda_download.download_cuda_binary() + except Exception as e: + import logging + logging.getLogger(__name__).error(f"CUDA download failed: {e}") + + asyncio.create_task(_download()) + return {"message": "CUDA backend download started", "progress_key": "cuda-backend"} + + +@app.delete("/backend/cuda") +async def delete_cuda_backend(): + """Delete the downloaded CUDA backend binary.""" + from . import cuda_download + + if cuda_download.is_cuda_active(): + raise HTTPException( + status_code=409, + detail="Cannot delete CUDA backend while it is active. 
Switch to CPU first.", + ) + + deleted = await cuda_download.delete_cuda_binary() + if not deleted: + raise HTTPException(status_code=404, detail="No CUDA backend found to delete") + + return {"message": "CUDA backend deleted"} + + +@app.get("/backend/cuda-progress") +async def get_cuda_download_progress(): + """Get CUDA backend download progress via Server-Sent Events.""" + progress_manager = get_progress_manager() + + async def event_generator(): + async for event in progress_manager.subscribe("cuda-backend"): + yield event + + return StreamingResponse( + event_generator(), + media_type="text/event-stream", + headers={ + "Cache-Control": "no-cache", + "Connection": "keep-alive", + "X-Accel-Buffering": "no", + }, + ) + + # ============================================ # STARTUP & SHUTDOWN # ============================================ diff --git a/backend/models.py b/backend/models.py index 3f55b591..3d8261b2 100644 --- a/backend/models.py +++ b/backend/models.py @@ -127,6 +127,7 @@ class HealthResponse(BaseModel): gpu_type: Optional[str] = None # GPU type (CUDA, MPS, or None) vram_used_mb: Optional[float] = None backend_type: Optional[str] = None # Backend type (mlx or pytorch) + backend_variant: Optional[str] = None # Binary variant (cpu or cuda) class ModelStatus(BaseModel): diff --git a/backend/requirements.txt b/backend/requirements.txt index 4af5cc85..661bf6de 100644 --- a/backend/requirements.txt +++ b/backend/requirements.txt @@ -20,6 +20,9 @@ soundfile>=0.12.0 numpy>=1.24.0 numba>=0.60.0,<0.61.0 +# HTTP client (for CUDA backend download) +httpx>=0.27.0 + # Utilities python-multipart>=0.0.6 Pillow>=10.0.0 diff --git a/backend/server.py b/backend/server.py index b5621cd1..9a26af4a 100644 --- a/backend/server.py +++ b/backend/server.py @@ -64,7 +64,29 @@ default=None, help="Data directory for database, profiles, and generated audio", ) + parser.add_argument( + "--version", + action="store_true", + help="Print version and exit", + ) args = parser.parse_args() + + if args.version: + from backend import __version__ + print(f"voicebox-server {__version__}") + sys.exit(0) + + # Detect backend variant from binary name + # voicebox-server-cuda → sets VOICEBOX_BACKEND_VARIANT=cuda + import os + binary_name = os.path.basename(sys.executable).lower() + if "cuda" in binary_name: + os.environ["VOICEBOX_BACKEND_VARIANT"] = "cuda" + logger.info("Backend variant: CUDA") + else: + os.environ["VOICEBOX_BACKEND_VARIANT"] = "cpu" + logger.info("Backend variant: CPU") + logger.info(f"Parsed arguments: host={args.host}, port={args.port}, data_dir={args.data_dir}") # Set data directory if provided diff --git a/docs/plans/CUDA_BACKEND_SWAP.md b/docs/plans/CUDA_BACKEND_SWAP.md new file mode 100644 index 00000000..b270e962 --- /dev/null +++ b/docs/plans/CUDA_BACKEND_SWAP.md @@ -0,0 +1,581 @@ +# CUDA Backend Swap via Binary Replacement + +> Status: Plan | Target: v0.2.0 | Created: 2026-03-12 + +## Problem + +The CUDA PyTorch backend binary is ~2.4 GB. GitHub Releases has a 2 GB asset limit. The current release ships CPU-only PyTorch on Windows and Intel Mac — NVIDIA GPU users get no acceleration from official releases. This is the #1 reported issue category (19 open issues). + +Users who want GPU today must clone the repo and run from source. That's not acceptable for a desktop app targeting non-technical users. + +## Solution + +Ship two backend binaries: a default CPU build (~150 MB) bundled with the app, and a downloadable CUDA build (~2.4 GB) hosted externally. 
When the user downloads the CUDA build, the app kills the current backend process, swaps in the CUDA binary, and relaunches — a backend-only restart. The frontend stays running, all UI state is preserved. + +No subprocesses. No HTTP protocol between processes. No port allocation. No provider manager. The backend is still one monolithic process — just a different binary. + +## Architecture + +### What Exists Today + +``` +Tauri App + ├── React Frontend (in-process webview) + └── voicebox-server (sidecar subprocess on :17493) + └── One PyInstaller binary: CPU PyTorch or MLX +``` + +**Sidecar lifecycle** (`tauri/src-tauri/src/main.rs`): +- `start_server` command spawns `voicebox-server` sidecar (line 181) +- Binary located at `tauri/src-tauri/binaries/voicebox-server-{platform-triple}` +- Tauri resolves the sidecar name via `externalBin` in `tauri.conf.json` (line 16) +- Waits up to 120s for "Uvicorn running" in stdout/stderr (line 286) +- `stop_server` kills the process tree (line 466) + +**Frontend reconnection** (`app/src/lib/hooks/useServer.ts`): +- Health check polls `GET /health` every 30 seconds +- React Query cache retains data for 10 minutes after disconnect +- All UI state (Zustand stores, form data, open tabs) survives disconnection +- No active reconnect logic — just keeps polling until server responds + +This means a backend restart is mostly invisible to the frontend: it sees a few seconds of failed health checks, then the server comes back. The only risk is in-flight operations (generation, transcription) failing mid-request. + +### What Changes + +``` +Tauri App + ├── React Frontend (in-process webview) + └── voicebox-server (sidecar subprocess on :17493) + └── One of: + ├── voicebox-server-cpu (bundled, ~150 MB) + └── voicebox-server-cuda (downloaded, ~2.4 GB) +``` + +The CUDA binary is functionally identical to the CPU binary. Same FastAPI app, same endpoints, same code. The only difference is PyTorch is compiled with CUDA 12.1 support and the binary includes CUDA runtime libraries. + +The user downloads it once. On every subsequent app launch, Tauri checks which binary variant exists and spawns the appropriate one. + +## Implementation Plan + +### Phase 1: Build Infrastructure + +Build the CUDA binary in CI separately from the main release. + +#### 1a. CUDA PyInstaller Build + +Add a `build_binary_cuda.py` or parameterize the existing `build_binary.py`: + +```python +# backend/build_binary.py — add flag +def build_server(cuda=False): + args = [ + 'server.py', + '--onefile', + '--name', f'voicebox-server-{"cuda" if cuda else "cpu"}', + ] + + if cuda: + args.extend([ + '--hidden-import', 'torch.cuda', + '--hidden-import', 'torch.backends.cudnn', + ]) + # ... rest of existing build +``` + +The `--onefile` flag is already used, which produces a single executable. This is important — `--onedir` would complicate the swap (replacing a directory vs a file). + +#### 1b. 
CI Workflow for CUDA Binary + +New workflow: `.github/workflows/build-cuda.yml` + +```yaml +name: Build CUDA Provider +on: + workflow_dispatch: + push: + tags: ["v*"] + +jobs: + build-cuda: + runs-on: windows-latest # CUDA is Windows/Linux only + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + with: { python-version: "3.12" } + - name: Install dependencies + run: | + pip install pyinstaller + pip install -r backend/requirements.txt + pip install torch --index-url https://download.pytorch.org/whl/cu121 --force-reinstall + - name: Build CUDA binary + run: python backend/build_binary.py --cuda + - name: Split binary for GitHub Releases + run: | + python scripts/split_binary.py backend/dist/voicebox-server-cuda.exe \ + --chunk-size 1900MB \ + --output release-assets/ + - name: Upload to R2 + # Full binary to R2 (no size limit) + run: | + aws s3 cp backend/dist/voicebox-server-cuda.exe \ + s3://voicebox-downloads/cuda/v${{ github.ref_name }}/voicebox-server-cuda.exe \ + --endpoint-url ${{ secrets.R2_ENDPOINT }} + - name: Upload split parts to GitHub Release + # Split parts as GitHub Release assets (each <2 GB) + uses: softprops/action-gh-release@v1 + with: + files: release-assets/* +``` + +Two distribution paths for redundancy: +- **Cloudflare R2**: Full binary, direct download, no size limit. +- **GitHub Releases**: Split into <2 GB chunks as fallback. + +#### 1c. Binary Splitting Script + +```python +# scripts/split_binary.py +"""Split a large binary into chunks for GitHub Releases.""" +import hashlib +import argparse +from pathlib import Path + +def split(input_path: Path, chunk_size: int, output_dir: Path): + output_dir.mkdir(parents=True, exist_ok=True) + data = input_path.read_bytes() + + # Write SHA-256 of the complete file + sha256 = hashlib.sha256(data).hexdigest() + (output_dir / f"{input_path.stem}.sha256").write_text( + f"{sha256} {input_path.name}\n" + ) + + # Split into chunks + parts = [] + for i in range(0, len(data), chunk_size): + part_name = f"{input_path.stem}.part{len(parts):02d}{input_path.suffix}" + part_path = output_dir / part_name + part_path.write_bytes(data[i:i + chunk_size]) + parts.append(part_name) + + # Write manifest + (output_dir / f"{input_path.stem}.manifest").write_text( + "\n".join(parts) + "\n" + ) + + print(f"Split into {len(parts)} parts, SHA-256: {sha256}") +``` + +### Phase 2: Download & Assemble in App + +#### 2a. Backend Download Endpoint + +Add to `backend/main.py`: + +```python +@app.post("/backend/download-cuda") +async def download_cuda_backend(): + """Download the CUDA backend binary.""" + # Returns immediately, runs download in background + task = asyncio.create_task(_download_cuda_binary()) + task.add_done_callback(lambda t: logger.error(f"CUDA download failed: {t.exception()}") if t.exception() else None) + return {"status": "downloading"} + +@app.get("/backend/cuda-status") +async def cuda_status(): + """Check if CUDA binary is available.""" + cuda_path = _get_cuda_binary_path() + return { + "available": cuda_path is not None and cuda_path.exists(), + "active": _is_cuda_active(), + "download_progress": progress_manager.get_progress("cuda-backend"), + } +``` + +#### 2b. 
Download + Assemble + Verify Logic + +New file: `backend/cuda_download.py` + +Core logic: + +```python +import hashlib +from pathlib import Path +from backend.config import get_data_dir +from backend.utils.progress import get_progress_manager + +CUDA_DOWNLOAD_URL = "https://downloads.voicebox.sh/cuda/{version}/voicebox-server-cuda{ext}" +CUDA_CHECKSUMS = { + # Populated per release + "0.2.0-windows": "sha256:abc123...", + "0.2.0-linux": "sha256:def456...", +} + +def get_cuda_binary_dir() -> Path: + """Where CUDA binaries live. Inside the app's data directory.""" + return get_data_dir() / "backends" + +def get_cuda_binary_path() -> Path | None: + """Return path to CUDA binary if it exists and is verified.""" + d = get_cuda_binary_dir() + for name in ["voicebox-server-cuda.exe", "voicebox-server-cuda"]: + p = d / name + if p.exists(): + return p + return None + +async def download_cuda_binary(version: str): + """Download, assemble (if split), and verify the CUDA binary.""" + progress = get_progress_manager() + dest_dir = get_cuda_binary_dir() + dest_dir.mkdir(parents=True, exist_ok=True) + + ext = ".exe" if sys.platform == "win32" else "" + url = CUDA_DOWNLOAD_URL.format(version=version, ext=ext) + + # Download with progress tracking + temp_path = dest_dir / f"voicebox-server-cuda{ext}.download" + async with httpx.AsyncClient(follow_redirects=True) as client: + async with client.stream("GET", url) as response: + total = int(response.headers.get("content-length", 0)) + downloaded = 0 + with open(temp_path, "wb") as f: + async for chunk in response.aiter_bytes(chunk_size=1024 * 1024): + f.write(chunk) + downloaded += len(chunk) + progress.update("cuda-backend", downloaded, total) + + # Verify checksum + sha256 = hashlib.sha256(temp_path.read_bytes()).hexdigest() + expected = CUDA_CHECKSUMS.get(f"{version}-{sys.platform}") + if expected and not expected.endswith(sha256): + temp_path.unlink() + raise ValueError(f"Checksum mismatch: expected {expected}, got sha256:{sha256}") + + # Atomic move into place + final_path = dest_dir / f"voicebox-server-cuda{ext}" + temp_path.rename(final_path) + + # Make executable on Unix + if sys.platform != "win32": + final_path.chmod(0o755) + + progress.complete("cuda-backend") +``` + +Key points: +- Downloads to a `.download` temp file, verifies checksum, then atomically renames. No partial binaries left on crash. +- Progress tracked via the existing `ProgressManager` so the frontend SSE system works unchanged. +- CUDA binary lives in the **app data directory** (`data/backends/`), not alongside the app bundle. This avoids code-signing issues on macOS (though CUDA isn't relevant on macOS) and survives app updates. + +#### 2c. 
Reassembly from Split Parts (GitHub Releases Fallback) + +If the R2 download fails, fall back to downloading split parts from GitHub Releases: + +```python +async def download_cuda_from_github(version: str): + """Fallback: download split parts from GitHub Releases, reassemble.""" + base_url = f"https://github.com/jamiepine/voicebox/releases/download/v{version}" + + # Get manifest + manifest_url = f"{base_url}/voicebox-server-cuda.manifest" + async with httpx.AsyncClient(follow_redirects=True) as client: + manifest = (await client.get(manifest_url)).text + parts = [p.strip() for p in manifest.strip().splitlines()] + + # Download checksum + sha256_url = f"{base_url}/voicebox-server-cuda.sha256" + expected_sha = (await client.get(sha256_url)).text.split()[0] + + # Download parts + dest_dir = get_cuda_binary_dir() + dest_dir.mkdir(parents=True, exist_ok=True) + temp_path = dest_dir / "voicebox-server-cuda.exe.download" + + total_downloaded = 0 + with open(temp_path, "wb") as f: + for i, part_name in enumerate(parts): + part_url = f"{base_url}/{part_name}" + async with client.stream("GET", part_url) as response: + async for chunk in response.aiter_bytes(chunk_size=1024 * 1024): + f.write(chunk) + total_downloaded += len(chunk) + get_progress_manager().update( + "cuda-backend", total_downloaded, None, + message=f"Downloading part {i+1}/{len(parts)}" + ) + + # Verify reassembled file + sha256 = hashlib.sha256(temp_path.read_bytes()).hexdigest() + if sha256 != expected_sha: + temp_path.unlink() + raise ValueError(f"Checksum mismatch after reassembly") + + final_path = dest_dir / "voicebox-server-cuda.exe" + temp_path.rename(final_path) + get_progress_manager().complete("cuda-backend") +``` + +### Phase 3: Backend Restart (The Swap) + +This is the core of the feature: kill the CPU backend, launch the CUDA backend, frontend reconnects automatically. + +#### 3a. New Tauri Command: `restart_server` + +Add to `tauri/src-tauri/src/main.rs`: + +```rust +#[command] +async fn restart_server( + app: tauri::AppHandle, + state: State<'_, ServerState>, + use_cuda: Option, +) -> Result { + println!("restart_server: use_cuda={:?}", use_cuda); + + // 1. Stop the current server + stop_server(state.clone()).await?; + + // 2. Brief wait for port release + tokio::time::sleep(tokio::time::Duration::from_millis(500)).await; + + // 3. Start with the appropriate binary + // The start_server logic needs to check for CUDA binary + start_server(app, state, None).await +} +``` + +#### 3b. Modify `start_server` to Prefer CUDA Binary + +The existing `start_server` uses `app.shell().sidecar("voicebox-server")` which resolves via Tauri's `externalBin` config. For the CUDA binary (which lives in the data directory, not the app bundle), we need an alternative launch path. 
+ +Modify `start_server` in `main.rs`: + +```rust +// After the existing sidecar logic, before spawning: + +// Check for CUDA binary in data directory +let cuda_binary = data_dir.join("backends") + .join(if cfg!(windows) { "voicebox-server-cuda.exe" } else { "voicebox-server-cuda" }); + +let (mut rx, child) = if cuda_binary.exists() { + println!("Found CUDA backend binary at {:?}", cuda_binary); + + // Launch CUDA binary directly (not as Tauri sidecar) + let mut cmd = app.shell().command(cuda_binary.to_str().unwrap()); + cmd = cmd.args([ + "--data-dir", + data_dir.to_str().ok_or("Invalid data dir path")?, + "--port", + &SERVER_PORT.to_string(), + ]); + if remote.unwrap_or(false) { + cmd = cmd.args(["--host", "0.0.0.0"]); + } + cmd.spawn().map_err(|e| format!("Failed to spawn CUDA backend: {}", e))? +} else { + // Existing sidecar launch (CPU binary bundled with app) + sidecar.spawn().map_err(|e| format!("Failed to spawn: {}", e))? +}; +``` + +Key decisions: +- CUDA binary is launched via `app.shell().command()` (arbitrary path), not `app.shell().sidecar()` (bundled path). Tauri's sidecar system only resolves binaries within the app bundle. +- The CUDA binary gets the same args (`--data-dir`, `--port`) as the CPU binary. It's the same `server.py` entry point. +- Preference: if CUDA binary exists, use it. Otherwise fall back to bundled CPU. No user configuration needed. + +#### 3c. Frontend: Trigger Restart After Download + +Add to the platform lifecycle interface (`app/src/platform/types.ts`): + +```typescript +interface PlatformLifecycle { + startServer(remote?: boolean): Promise; + stopServer(): Promise; + restartServer(useCuda?: boolean): Promise; // new + // ... +} +``` + +Implement in `tauri/src/platform/lifecycle.ts`: + +```typescript +async restartServer(useCuda?: boolean): Promise { + const result = await invoke('restart_server', { useCuda }); + this.onServerReady?.(); + return result; +} +``` + +#### 3d. Frontend: GPU Settings UI + +Add a section to the Server Settings page (or Model Management). Minimal UI: + +``` +┌─────────────────────────────────────────────┐ +│ GPU Acceleration │ +│ │ +│ Status: CPU only (no CUDA backend) │ +│ │ +│ [Download CUDA Backend (2.4 GB)] │ +│ │ +│ Requires an NVIDIA GPU with 4+ GB VRAM. │ +│ The app will restart its backend process │ +│ after download. Your work is preserved. │ +└─────────────────────────────────────────────┘ +``` + +After download: + +``` +┌─────────────────────────────────────────────┐ +│ GPU Acceleration │ +│ │ +│ Status: ✓ CUDA backend active (RTX 4090) │ +│ │ +│ [Switch to CPU] [Delete CUDA Backend] │ +└─────────────────────────────────────────────┘ +``` + +#### 3e. Frontend: Reconnection During Restart + +The current health poll interval is 30 seconds — too slow for a restart UX. During a restart, temporarily increase polling: + +```typescript +// In the component that triggers restart: +const restart = async () => { + setRestarting(true); + try { + await platform.lifecycle.restartServer(true); + } catch (e) { + // Frontend will show "reconnecting" state + } + // Aggressively poll until health check succeeds + const interval = setInterval(async () => { + try { + await apiClient.getHealth(); + clearInterval(interval); + setRestarting(false); + queryClient.invalidateQueries(); // Refresh all data + } catch {} + }, 1000); // Poll every 1s during restart + // Safety timeout + setTimeout(() => clearInterval(interval), 30000); +}; +``` + +### Phase 4: Auto-Detection on Startup + +No user action needed on subsequent launches. 
The preference logic in `start_server` (Phase 3b) handles this: + +1. App launches → `start_server` called +2. Check `data/backends/voicebox-server-cuda{.exe}` +3. If exists → launch CUDA binary +4. If not → launch bundled CPU binary + +The user downloads CUDA once, and every future app launch (including after updates) uses it automatically. The CUDA binary lives in the data directory, not the app bundle, so app updates don't overwrite it. + +### Phase 5: Handling Version Mismatches + +When the app updates but the CUDA binary is from an older version, the API might be incompatible. Handle this by: + +1. Add `--version` flag to `server.py`: + +```python +parser.add_argument("--version", action="store_true") +# If invoked with --version, print version and exit +if args.version: + from backend import __version__ + print(f"voicebox-server {__version__}") + sys.exit(0) +``` + +2. In `start_server` (Rust), before launching the CUDA binary: + +```rust +// Quick version check +let version_output = std::process::Command::new(cuda_binary.to_str().unwrap()) + .arg("--version") + .output(); + +match version_output { + Ok(output) => { + let version = String::from_utf8_lossy(&output.stdout); + let app_version = env!("CARGO_PKG_VERSION"); + if !version.contains(app_version) { + println!("CUDA binary version mismatch (app: {}, cuda: {}), falling back to CPU", + app_version, version.trim()); + // Fall through to CPU sidecar launch + } + } + Err(_) => { + println!("Failed to check CUDA binary version, falling back to CPU"); + } +} +``` + +3. Frontend shows a notification: "Your GPU backend needs an update. [Download latest] or [Use CPU for now]" + +## Files Changed + +### New Files + +| File | Purpose | +|------|---------| +| `backend/cuda_download.py` | Download, reassemble, verify CUDA binary | +| `scripts/split_binary.py` | Split binary into <2 GB chunks for GitHub Releases | +| `.github/workflows/build-cuda.yml` | CI: build + upload CUDA binary | + +### Modified Files + +| File | Change | +|------|--------| +| `tauri/src-tauri/src/main.rs` | Add `restart_server` command, modify `start_server` to check for CUDA binary in data dir | +| `backend/server.py` | Add `--version` flag | +| `backend/main.py` | Add `/backend/download-cuda`, `/backend/cuda-status`, `/backend/progress/cuda-backend` endpoints | +| `backend/build_binary.py` | Accept `--cuda` flag to build CUDA variant | +| `app/src/platform/types.ts` | Add `restartServer` to lifecycle interface | +| `tauri/src/platform/lifecycle.ts` | Implement `restartServer` | +| `app/src/components/ServerSettings/` | New GPU acceleration section | +| `.github/workflows/release.yml` | Trigger CUDA build workflow on tag | + +### NOT Changed + +| File | Why | +|------|-----| +| `backend/backends/__init__.py` | No changes to the TTSBackend singleton or factory. CUDA binary runs the same code. | +| `backend/backends/pytorch_backend.py` | Already detects CUDA at runtime (line 28-49). No changes needed. | +| `app/src/lib/api/client.ts` | API is identical between CPU and CUDA backends. | +| `app/src/lib/hooks/useGenerationForm.ts` | Generation flow is unchanged. | + +## What This Doesn't Solve + +- **Multi-model support** — This is purely about GPU acceleration. LuxTTS, Chatterbox, etc. need the in-process model registry, which is an independent workstream. +- **AMD GPU support** — DirectML/ROCm needs a different PyTorch build. Same pattern applies (another binary variant) but deferred. +- **Linux CUDA** — Same approach works, just another CI matrix entry. 
Can be added in the same release or shortly after. +- **Remote server mode** — Users who want to run TTS on a different machine still need the external provider architecture. Separate concern. + +## What This DOES Solve + +- **19 "GPU not detected" issues** — Users download the CUDA backend, restart, GPU works. +- **2 GB GitHub Release limit** — Binary splitting + R2 hosting. +- **Update burden** — App updates don't re-download the 2.4 GB CUDA binary. It persists in the data directory. +- **First-run experience** — App works immediately on CPU. GPU is an optional enhancement, not a setup blocker. + +## Rollout Plan + +1. Build and test CUDA binary locally on Windows with an NVIDIA GPU. +2. Set up R2 bucket at `downloads.voicebox.sh/cuda/`. +3. Ship the backend restart + download UI in v0.2.0. +4. Announce: "GPU acceleration is here — one click in Settings." + +## Risks + +| Risk | Mitigation | +|------|-----------| +| CUDA binary doesn't work on some GPU/driver combos | `/health` endpoint reports GPU info. Fallback to CPU if CUDA init fails. Clear error message. | +| Antivirus flags downloaded binary (Windows) | Code-sign the CUDA binary in CI. Document AV exceptions. | +| Data dir CUDA binary survives app uninstall | Document in uninstall notes. Not a real problem — it's just a file. | +| Version mismatch after app update | Version check on startup (Phase 5). Auto-fallback to CPU. Prompt to re-download. | +| R2 downtime | GitHub Releases split-binary fallback. | +| Download interrupted | Temp file with `.download` extension. Atomic rename on completion. Resume not implemented in v1 — restart download from scratch. | diff --git a/docs/plans/CUDA_BACKEND_SWAP_FINAL.md b/docs/plans/CUDA_BACKEND_SWAP_FINAL.md new file mode 100644 index 00000000..ebd22534 --- /dev/null +++ b/docs/plans/CUDA_BACKEND_SWAP_FINAL.md @@ -0,0 +1,133 @@ +# CUDA Backend Swap — Implementation Summary + +> Status: **Complete** | Branch: `feat/cuda-backend-swap` | Created: 2026-03-12 + +## What This Is + +A standalone feature that lets users download a CUDA-enabled backend binary (~2.4 GB) and swap it in via a backend-only restart. The frontend stays running, all UI state is preserved. This solves the #1 user pain point: 19 open issues about "GPU not detected" caused by GitHub's 2 GB release asset limit preventing CUDA binaries from shipping in official releases. + +## How It Works + +``` +User clicks "Download CUDA Backend" in Settings + → Backend fetches manifest from GitHub Releases + → Downloads split parts (<2 GB each), concatenates them + → SHA-256 integrity check on reassembled binary + → Binary placed in {app_data_dir}/backends/voicebox-server-cuda + → User clicks "Switch to CUDA Backend" + → Tauri kills CPU process, launches CUDA binary, frontend reconnects + → On all future app launches, CUDA binary is auto-detected and used +``` + +The CUDA binary is functionally identical to the CPU binary — same FastAPI app, same endpoints, same code. The only difference is PyTorch compiled with CUDA 12.1 and bundled CUDA runtime libraries. + +## Architecture Decisions + +**Backend-only restart, not full app restart.** The Tauri shell kills the current `voicebox-server` process, waits 1 second for port release, and spawns the new binary. The React frontend stays running. Health polling detects the new backend within seconds. + +**No provider/subprocess architecture.** This is explicitly not the PR #33 approach (10K+ lines, 136 files, 22 bugs). One process at a time. 
The CUDA binary replaces the CPU binary — it doesn't run alongside it. + +**Data directory, not app bundle.** The CUDA binary lives in `{app_data_dir}/backends/`, which persists across app updates and avoids code-signing issues. The bundled CPU binary in the app bundle is untouched. + +**Version mismatch protection.** On startup, Rust runs `voicebox-server-cuda --version` and compares to the app version from `tauri.conf.json`. If they don't match (e.g., after an app update), it falls back to the bundled CPU binary silently. + +**GitHub Releases distribution.** The CUDA binary is split into <2 GB chunks (GitHub's asset limit) via `scripts/split_binary.py`. The app downloads a manifest, fetches each part, concatenates them, and runs a SHA-256 integrity check to verify reassembly. No external hosting needed. + +## Files Changed + +### New Files + +| File | Lines | Purpose | +|------|-------|---------| +| `backend/cuda_download.py` | ~190 | Download split parts from GitHub Releases, reassemble, verify integrity | +| `scripts/split_binary.py` | ~80 | Split large binary into <2 GB chunks with SHA-256 manifest | +| `.github/workflows/build-cuda.yml` | ~70 | CI workflow: build CUDA binary, split, upload to GitHub Releases | +| `app/src/components/ServerSettings/GpuAcceleration.tsx` | 371 | GPU Acceleration UI card (status, download, restart, delete) | +| `docs/plans/CUDA_BACKEND_SWAP.md` | 581 | Original implementation plan (5 phases with code sketches) | +| `docs/plans/CUDA_BACKEND_SWAP_FINAL.md` | this file | Final implementation summary | +| `docs/plans/PROJECT_STATUS.md` | 462 | Full project triage (all PRs, issues, architecture) | +| `docs/plans/PR33_CUDA_PROVIDER_REVIEW.md` | ~350 | Detailed code review of PR #33 (22 bugs documented) | + +### Modified Files + +| File | What Changed | +|------|-------------| +| `backend/build_binary.py` | Added `--cuda` flag, parameterized output binary name | +| `backend/server.py` | Added `--version` flag, auto-detect backend variant from binary name (`VOICEBOX_BACKEND_VARIANT` env var) | +| `backend/main.py` | 4 new endpoints (`/backend/cuda-status`, `/backend/download-cuda`, `/backend/cuda`, `/backend/cuda-progress`), health endpoint returns `backend_variant` | +| `backend/models.py` | `HealthResponse` model: added `backend_variant` field | +| `backend/requirements.txt` | Added `httpx>=0.27.0` for async HTTP downloads | +| `tauri/src-tauri/src/main.rs` | `restart_server` command (stop → wait → start), `start_server` checks for CUDA binary in data dir and launches via `shell().command()`, version mismatch check | +| `app/src/platform/types.ts` | `PlatformLifecycle.restartServer()` added | +| `tauri/src/platform/lifecycle.ts` | `restartServer()` implementation via `invoke('restart_server')` | +| `web/src/platform/lifecycle.ts` | `restartServer()` noop for web platform | +| `app/src/lib/api/types.ts` | `CudaStatus`, `CudaDownloadProgress` interfaces; `HealthResponse` updated with `gpu_type`, `backend_type`, `backend_variant` | +| `app/src/lib/api/client.ts` | `getCudaStatus()`, `downloadCudaBackend()`, `deleteCudaBackend()` methods | +| `app/src/components/ServerTab/ServerTab.tsx` | Wired in `` component (Tauri-only) | + +## Backend API Endpoints + +| Method | Path | Purpose | +|--------|------|---------| +| `GET` | `/backend/cuda-status` | Returns `{ available, active, binary_path, downloading, download_progress }` | +| `POST` | `/backend/download-cuda` | Starts background download; returns immediately. Track via SSE. 
| +| `DELETE` | `/backend/cuda` | Deletes CUDA binary (blocked if CUDA is currently active) | +| `GET` | `/backend/cuda-progress` | SSE stream of download progress (reuses existing `ProgressManager`) | + +The existing `GET /health` endpoint now returns two new fields: +- `backend_type`: `"pytorch"` or `"mlx"` (existing detection) +- `backend_variant`: `"cpu"` or `"cuda"` (set from `VOICEBOX_BACKEND_VARIANT` env var) + +## Frontend UI States + +The `GpuAcceleration` card in Server Settings handles these states: + +1. **Native GPU detected** (MPS, MLX, XPU, DirectML) — Shows info message, no download needed +2. **No CUDA binary** — Download button with size estimate, description of requirements +3. **Downloading** — SSE-driven progress bar with bytes/total and percentage +4. **Downloaded, not active** — "Switch to CUDA Backend" button + "Remove" option +5. **CUDA active** — Shows CUDA badge, "Switch to CPU Backend" button +6. **Restarting** — Spinner with phase text, 1s health polling as safety net +7. **Error** — Red error message with details + +### Key UX detail: switching to CPU + +Since `start_server` always prefers the CUDA binary if it exists on disk, "Switch to CPU" must delete the CUDA binary first, then restart. The user can re-download later. This avoids a persistent configuration mechanism (no new state to manage, no new config file, no DB column). + +## Rust: Server Lifecycle + +``` +start_server + ├── Check for CUDA binary at {data_dir}/backends/voicebox-server-cuda + ├── If found: run --version, compare to app version + │ ├── Match: launch via shell().command() with --data-dir, --port + │ └── Mismatch: log warning, fall through to CPU + └── Else: launch bundled sidecar via shell().sidecar() + +restart_server + ├── stop_server (kill process tree) + ├── wait 1 second for port release + └── start_server (auto-detects CUDA) +``` + +## What This Doesn't Cover + +- **AMD GPU / ROCm / DirectML binary** — Same pattern, different PyTorch build. Future PR. +- **Linux CUDA** — Same approach, just another CI matrix entry. Can ship same release. +- **Multi-model support** — LuxTTS, Chatterbox, etc. are a separate architectural concern (in-process model registry). Independent of binary variant. +- **Download resume** — If download is interrupted, it restarts from scratch. Acceptable for v1. +- **Remote server CUDA** — Users running voicebox-server on a remote machine manage their own binaries. This feature is for the desktop app. 
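For orientation, the reassemble-and-verify step described under "How It Works" is small enough to sketch here. This mirrors what `backend/cuda_download.py` does once all parts are on disk; the function name and signature below are illustrative, not the module's actual API, and the real code adds async `httpx` downloads and SSE progress events.

```python
import hashlib
from pathlib import Path


def reassemble_and_verify(parts_dir: Path, stem: str, output: Path) -> None:
    """Concatenate downloaded parts in manifest order, verify SHA-256, rename into place.

    Illustrative sketch only; the real logic lives in backend/cuda_download.py.
    """
    part_names = (parts_dir / f"{stem}.manifest").read_text().split()
    expected = (parts_dir / f"{stem}.sha256").read_text().split()[0]

    tmp = output.with_name(output.name + ".download")  # temp file, renamed only on success
    digest = hashlib.sha256()
    with tmp.open("wb") as out:
        for name in part_names:
            chunk = (parts_dir / name).read_bytes()
            digest.update(chunk)
            out.write(chunk)

    if digest.hexdigest() != expected:
        tmp.unlink()
        raise ValueError("SHA-256 mismatch after reassembly")

    tmp.replace(output)  # atomic rename within the same directory
```

The `.download` temp file and atomic rename match the interrupted-download behavior noted in the risks table: a partial download never overwrites a working binary.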
+ +## Testing Checklist + +- [ ] Build CUDA binary locally with `python backend/build_binary.py --cuda` +- [ ] `voicebox-server-cuda --version` prints correct version +- [ ] Place CUDA binary in `{data_dir}/backends/`, launch app → auto-detects and uses it +- [ ] Version mismatch: rename binary to have wrong version → falls back to CPU +- [ ] Frontend: GpuAcceleration card shows correct state for CPU, CUDA available, CUDA active +- [ ] Download flow: POST triggers download, SSE progress works, completion updates status +- [ ] Switch to CUDA: restart works, health endpoint shows `backend_variant: "cuda"` +- [ ] Switch to CPU: deletes binary, restarts, health shows `backend_variant: "cpu"` +- [ ] Delete CUDA while active: returns 409 error +- [ ] Split binary script: `python scripts/split_binary.py` creates manifest + parts + sha256 +- [ ] Native GPU (macOS MPS): shows info message, no download section diff --git a/docs/plans/PR33_CUDA_PROVIDER_REVIEW.md b/docs/plans/PR33_CUDA_PROVIDER_REVIEW.md new file mode 100644 index 00000000..66554dd6 --- /dev/null +++ b/docs/plans/PR33_CUDA_PROVIDER_REVIEW.md @@ -0,0 +1,500 @@ +# PR #33 — CUDA Provider System Review + +> Branch: `external-provider-binaries` | Created: 2026-02-01 | 34 commits, 136 files, +10,266 lines +> Reviewed: 2026-03-12 + +--- + +## The Problem + +The CUDA PyTorch binary is ~2.4 GB. GitHub Releases has a 2 GB artifact limit. This means: + +- Windows/Linux users with NVIDIA GPUs cannot get GPU acceleration from official releases +- 19 open issues about "GPU not detected" — the single most reported problem category +- Users who want GPU must clone the repo and run from source +- Every app update forces re-download of the entire binary + +This is the #1 user pain point by volume. + +--- + +## What PR #33 Does + +Splits the monolithic Voicebox binary into two layers: + +``` +┌──────────────────────────────────────┐ +│ Main App (~150MB Win/Lin, ~300 Mac) │ +│ Tauri + React + FastAPI + Whisper │ +│ No PyTorch. MLX bundled on macOS. 
│ +├──────────────────────────────────────┤ +│ HTTP (localhost) │ +├──────────────────────────────────────┤ +│ Provider Binary (downloaded later) │ +│ PyTorch CPU (~300MB) │ +│ PyTorch CUDA (~2.4GB) │ +│ Hosted on Cloudflare R2 │ +└──────────────────────────────────────┘ +``` + +### New Backend Code + +| File | Purpose | +|------|---------| +| `backend/providers/__init__.py` (327 lines) | `ProviderManager` — lifecycle management, subprocess spawning, port allocation | +| `backend/providers/base.py` (97 lines) | `TTSProvider` Protocol definition | +| `backend/providers/bundled.py` (144 lines) | `BundledProvider` — wraps existing MLX/PyTorch backends for the new interface | +| `backend/providers/local.py` (191 lines) | `LocalProvider` — HTTP client that talks to external provider processes | +| `backend/providers/installer.py` (262 lines) | Download, extract, delete provider binaries | +| `backend/providers/types.py` (34 lines) | `ProviderType` enum, `ProviderInfo` dataclass | +| `backend/providers/checksums.py` (11 lines) | Checksum dict (currently empty) | + +### Provider Servers (Standalone Executables) + +| File | Purpose | +|------|---------| +| `providers/pytorch-cpu/main.py` (238 lines) | FastAPI server wrapping PyTorch CPU inference | +| `providers/pytorch-cuda/main.py` (238 lines) | FastAPI server wrapping PyTorch CUDA inference | +| `providers/pytorch-*/build.py` | PyInstaller build scripts | +| `providers/pytorch-*/requirements.txt` | Isolated dependencies | + +### Frontend + +| File | Purpose | +|------|---------| +| `app/src/components/ServerSettings/ProviderSettings.tsx` (400 lines) | Provider download/start/stop/delete UI | + +### Also Included (Scope Creep) + +The PR bundles several unrelated changes that inflate the diff: + +- `docs2/` — Entire documentation site rewrite (Fumadocs migration, ~3000 lines) +- `Dockerfile`, `Dockerfile.cuda`, `docker-compose.yml` — Docker support +- `landing/` — Banner removal +- UI refactors in Stories, History, Voice Profiles, Audio tab +- Linux audio capture module +- Various dependency bumps + +--- + +## Bug Report + +### Critical — Will Crash at Runtime + +#### C1. Provider `generate` endpoint can't parse requests + +**`providers/pytorch-cpu/main.py:91-97`** (same in pytorch-cuda) + +```python +@app.post("/tts/generate") +async def generate( + text: str, + voice_prompt: dict, + language: str = "auto", + seed: int = None, + model_size: str = "1.7B" +): +``` + +Parameters declared as function arguments. FastAPI interprets these as **query parameters**, not JSON body. But `LocalProvider.generate()` sends a JSON body via `httpx`: + +```python +# backend/providers/local.py:33-40 +response = await self.client.post("/tts/generate", json={ + "text": text, + "voice_prompt": voice_prompt, + ... +}) +``` + +**Result:** Every generation call to an external provider returns HTTP 422 (Validation Error). The generation path is completely broken for external providers. + +**Fix:** Use a Pydantic request body model: +```python +class GenerateRequest(BaseModel): + text: str + voice_prompt: dict + language: str = "auto" + seed: Optional[int] = None + model_size: str = "1.7B" + +@app.post("/tts/generate") +async def generate(data: GenerateRequest): +``` + +#### C2. Timeout error handler references undefined variables + +**`backend/providers/__init__.py:82-90`** + +```python +stdout_content = "" +stderr_content = "" +# ... threads write to stdout_queue / stderr_queue ... 
+except TimeoutError: + while not stdout_queue.empty(): + stdout_lines.append(stdout_queue.get_nowait()) # NameError + while not stderr_queue.empty(): + stderr_lines.append(stderr_queue.get_nowait()) # NameError +``` + +`stdout_lines` and `stderr_lines` are never defined. Every provider startup timeout will throw `NameError`, masking the real failure cause. Then `stdout_content` and `stderr_content` are logged but they're still empty strings — the queue data is never assigned back. + +#### C3. Sync `get_tts_model()` ignores external provider in async context + +**`backend/tts.py:15-29`** + +```python +def get_tts_model(): + manager = get_provider_manager() + loop = asyncio.get_event_loop() + if loop.is_running(): + # We're in an async context, but can't await here + return manager._get_default_provider() +``` + +FastAPI routes are async. This function is called from several code paths during generation. In async context it **always returns the bundled provider**, ignoring whatever external provider the user selected. The user downloads and starts a CUDA provider, but generation still runs on CPU. + +### Critical — Security + +#### C4. Path traversal via `tarfile.extractall()` (CVE-2007-4559) + +**`backend/providers/installer.py:115-118`** + +```python +with tarfile.open(archive_path, 'r:gz') as tar_ref: + tar_ref.extractall(providers_dir) +``` + +No member path filtering. A crafted `.tar.gz` from a compromised CDN can write files anywhere on disk via `../` entries. Python 3.12+ emits a deprecation warning for exactly this pattern. + +**Fix:** +```python +tar_ref.extractall(providers_dir, filter='data') # Python 3.12+ +``` + +Or manually validate each member: +```python +for member in tar_ref.getmembers(): + member_path = os.path.join(providers_dir, member.name) + if not os.path.commonpath([providers_dir, member_path]).startswith(str(providers_dir)): + raise ValueError(f"Path traversal attempt: {member.name}") +tar_ref.extractall(providers_dir) +``` + +#### C5. No checksum verification on downloaded binaries + +**`backend/providers/checksums.py`** + +```python +PROVIDER_CHECKSUMS = {} +``` + +Empty dict. `download_provider()` in `installer.py` never calls any verification function. Downloaded binaries are `chmod 0o755`'d and executed without integrity checks. A MitM or CDN compromise delivers arbitrary code. + +**Fix:** Populate checksums per release. Verify SHA-256 after download before extraction: +```python +import hashlib +sha256 = hashlib.sha256(archive_path.read_bytes()).hexdigest() +if sha256 != expected: + archive_path.unlink() + raise ValueError(f"Checksum mismatch for {provider_type}") +``` + +#### C6. Provider servers have no authentication + +**`providers/pytorch-cpu/main.py:18-23`** + +```python +app.add_middleware( + CORSMiddleware, + allow_origins=["*"], + ... +) +``` + +Zero auth. Any local process — including browser JavaScript via localhost — can send requests to the provider on its ephemeral port. Port is discoverable by scanning. + +**Fix:** Generate a random token in the parent process, pass via environment variable to the child, validate in middleware: +```python +# Parent (ProviderManager) +token = secrets.token_urlsafe(32) +env = {**os.environ, "VOICEBOX_PROVIDER_TOKEN": token} +process = subprocess.Popen([...], env=env, ...) 
+ +# Child (provider server) +EXPECTED_TOKEN = os.environ.get("VOICEBOX_PROVIDER_TOKEN") + +@app.middleware("http") +async def verify_token(request, call_next): + if request.headers.get("X-Provider-Token") != EXPECTED_TOKEN: + return JSONResponse(status_code=403, content={"error": "unauthorized"}) + return await call_next(request) +``` + +### Major — Will Cause Problems in Production + +#### M1. Leaked file handles on subprocess stdout/stderr + +**`backend/providers/__init__.py:68-73`** + +```python +process = subprocess.Popen( + [...], + stdout=open(stdout_log, 'w'), # leaked handle + stderr=open(stderr_log, 'w'), # leaked handle +) +``` + +File handles passed directly from `open()` without storing references. They close on GC, not deterministically. On Windows the log files stay locked and unreadable until the process exits. + +**Fix:** +```python +stdout_fh = open(stdout_log, 'w') +stderr_fh = open(stderr_log, 'w') +try: + process = subprocess.Popen([...], stdout=stdout_fh, stderr=stderr_fh) +finally: + stdout_fh.close() + stderr_fh.close() +``` + +#### M2. No subprocess crash detection or recovery + +**`backend/providers/__init__.py:56-110`** + +Once `start_provider()` succeeds, the `Popen` object is stored but never polled. If the provider process crashes mid-session: +- `LocalProvider` HTTP calls fail with `httpx.ConnectError` +- No auto-restart +- No health-check loop +- User sees cryptic "connection refused" errors +- Must manually restart provider from UI + +**Fix:** Background asyncio task that polls `process.poll()` every few seconds. On crash, update provider status and optionally auto-restart: +```python +async def _watch_provider_process(self): + while self._provider_process and self._provider_process.poll() is None: + await asyncio.sleep(5) + if self._provider_process and self._provider_process.returncode != 0: + logger.error(f"Provider crashed with code {self._provider_process.returncode}") + self.active_provider = self._default_provider + # Notify frontend via next health check +``` + +#### M3. Port allocation race condition (TOCTOU) + +**`backend/providers/__init__.py:145-149`** + +```python +def _get_free_port(self) -> int: + with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: + s.bind(('', 0)) + return s.getsockname()[1] + # Socket closed here — port is free but unprotected +``` + +Between this function returning and the provider process binding, another process can claim the port. On busy systems this causes "address already in use" failures. + +**Fix options:** +- Pass the socket fd to the child process (complex, platform-specific) +- Retry with a new port on bind failure (simplest) +- Use a fixed port range and try sequentially + +#### M4. `delete_provider()` leaves hundreds of MB behind + +**`backend/providers/installer.py:155-168`** + +```python +provider_path.unlink() # Deletes just the executable +``` + +PyInstaller `--onedir` produces a directory with the executable plus all shared libraries. `unlink()` only removes the binary file, leaving behind hundreds of MB of `.so`/`.dll`/`.dylib` files. + +**Fix:** +```python +provider_dir = provider_path.parent +shutil.rmtree(provider_dir) +``` + +#### M5. `LocalProvider.combine_voice_prompts()` bypasses the provider + +**`backend/providers/local.py:68-88`** + +This method imports from `..utils.audio` and processes locally instead of sending to the provider server. If the user chose an external provider because they lack local dependencies (e.g., no PyTorch on the machine), this will crash with `ImportError`. 
+ +#### M6. Download errors silently swallowed + +**`backend/main.py:1640`** + +```python +asyncio.create_task(download_provider(provider_type)) +``` + +Fire-and-forget. If the download fails, the exception is logged as "Task exception was never retrieved." The frontend SSE progress stream may hang forever showing "downloading" without the error. + +**Fix:** Store the task, add an error callback: +```python +task = asyncio.create_task(download_provider(provider_type)) +task.add_done_callback(lambda t: t.exception() if not t.cancelled() else None) +``` +And propagate errors through the progress manager so the SSE stream surfaces them. + +#### M7. `LocalProvider.is_loaded()` always returns `True` + +**`backend/providers/local.py:105-108`** + +```python +def is_loaded(self) -> bool: + return True # Return True optimistically +``` + +Health/status checks always report the model as loaded for external providers, even when the provider hasn't loaded anything yet. This breaks the "download model if not cached" logic in the generation flow. + +#### M8. `instruct` parameter silently dropped + +**`backend/providers/local.py:33-40`** + +The `generate()` method accepts `instruct` but never includes it in the JSON payload. The provider server also hardcodes `instruct=None`. Delivery instructions silently do nothing for external providers. + +### Minor + +| # | Issue | Location | +|---|-------|----------| +| m1 | `pytorch-cpu/main.py` and `pytorch-cuda/main.py` are 95% identical | Both files | +| m2 | `build.py` scripts also nearly identical | Both build files | +| m3 | `navigator.platform` is deprecated | `ProviderSettings.tsx:20-23` | +| m4 | `console.log('currentProvider', ...)` left in | `ProviderSettings.tsx:151` | +| m5 | `ProviderType` enum defined but never used for validation | `types.py:10-15` | +| m6 | `list_installed()` reimplements platform detection | `__init__.py:129-143` | +| m7 | New `httpx.AsyncClient` created per health poll iteration | `__init__.py:151-165` | +| m8 | `load_model_async()` only stores size, doesn't actually preload | `local.py:95-99` | + +--- + +## Scope Creep + +The PR should be split. These are independent changes bundled in: + +| Change | Lines | Should Be Separate PR | +|--------|-------|-----------------------| +| `docs2/` site rewrite | ~3000 | Yes | +| Docker support (Dockerfile, compose, docs) | ~600 | Yes — overlaps with PR #161 | +| Landing page banner removal | ~30 | Yes | +| UI refactors (Stories, History, Voices, Audio) | ~400 | Yes | +| Linux audio capture module | ~10 | Yes | +| Dependency bumps | ~100 | Yes | + +**Core provider system** (the actual feature) is ~2500 lines across backend + frontend + provider servers. That's the reviewable scope. + +--- + +## What's Well-Designed + +These parts should survive any rewrite: + +1. **`TTSProvider` Protocol** (`base.py`) — Structural typing via `@runtime_checkable Protocol`. Right pattern. Comprehensive interface. + +2. **`BundledProvider` / `LocalProvider` split** — Clean separation between in-process and HTTP-based inference. The wrapper pattern in `BundledProvider` correctly delegates to existing `TTSBackend`. + +3. **R2 distribution strategy** — Provider binaries on Cloudflare R2, main app on GitHub Releases. Correct solution to the 2 GB limit. + +4. **Progress tracking** — SSE-based download progress integrated with the existing `ProgressManager`. Good UX. + +5. **Subprocess log files** — Writing provider stdout/stderr to log files in the data directory is pragmatic and debuggable. + +6. 
**Frontend `ProviderSettings.tsx`** — Clean component structure. Proper loading/disabled states, confirmation dialogs, platform-aware visibility. + +7. **CI split** — Separate `build-providers` and `release` jobs. Providers built and uploaded to R2 independently. + +--- + +## Options for Moving Forward + +### Option A — Fix and Slim PR #33 + +Strip the PR down to just the provider system (~2500 lines). Fix the 5 critical and 8 major bugs. Rebase onto current `main`. + +**Effort:** ~2-3 days focused work +**Pros:** Full auto-managed provider lifecycle. Foundation for multi-model. +**Cons:** Still complex. Process management is inherently fragile cross-platform. + +### Option B — Manual External Server Mode + +Skip subprocess management entirely. Ship a "Connect to External Server" feature: + +1. User downloads CUDA provider zip from `downloads.voicebox.sh` +2. User runs it manually (`./tts-provider-pytorch-cuda --port 8100`) +3. In Voicebox UI: paste `http://localhost:8100` as the TTS server URL +4. Voicebox routes generation to that URL via `LocalProvider` + +This reuses `LocalProvider` from PR #33 but removes: +- `ProviderManager` subprocess spawning (the buggiest part) +- `installer.py` download/extract logic (the security risks) +- Port allocation (user picks the port) +- Process lifecycle management (user's responsibility) + +**Effort:** ~1 day. `LocalProvider` + a URL input field + health check. +**Pros:** Simple, reliable, no process management bugs, no security surface. +**Cons:** Manual setup. Not seamless. But CUDA users are already technical (they run from source today). + +### Option C — Hybrid (Recommended) + +Ship Option B first as v0.2.0. Then iterate toward auto-management: + +**Phase 1 (v0.2.0):** Manual external server mode +- `LocalProvider` HTTP client (from PR #33, with the 422 bug fixed) +- Server URL input in Settings +- Health indicator +- CUDA provider published as standalone zip on R2 +- One page of docs: "download, unzip, run, paste URL" + +**Phase 2 (v0.2.x):** Auto-download + auto-start +- `installer.py` with checksum verification and safe extraction +- `ProviderManager` subprocess spawning with crash detection +- Provider settings UI with download/start/stop buttons + +**Phase 3 (v0.3.0):** Multi-model providers +- Provider per model family (not just per hardware) +- LuxTTS provider, Chatterbox provider, etc. +- Provider marketplace / registry + +This gets CUDA into users' hands immediately (Phase 1 is ~1 day) while building toward the full vision incrementally. Each phase is independently shippable and testable. + +### Option D — GitHub Workaround + +Avoid the provider architecture entirely. Host CUDA binaries on R2 and add a download link in the app that opens the user's browser. User downloads the full monolithic CUDA build, replaces their existing install. + +**Effort:** Minimal — just hosting + a link. +**Pros:** Zero architecture changes. +**Cons:** Doesn't solve: multi-model, independent app updates, or the re-download-everything-on-update problem. Kicks the can. + +--- + +## Recommendation + +**Option C (Hybrid)** is the strongest path. Specifically: + +1. **Now:** Close PR #33 as-is. It's too large, too buggy, and too stale to salvage as a single merge. + +2. 
**Extract:** Cherry-pick the good parts into small focused PRs: + - PR: `TTSProvider` Protocol + `BundledProvider` + `LocalProvider` (the abstractions) + - PR: Provider settings UI (the frontend) + - PR: `installer.py` + checksums (the download system) + - PR: CI changes for R2 upload (the distribution) + +3. **Ship Phase 1:** Manual external server mode. One small PR. Unblocks every CUDA user immediately. + +4. **Iterate:** Layer in auto-management once the manual mode is proven stable. + +The critical bugs in PR #33 (C1-C6) are all fixable, but the PR's size makes review unreliable. Splitting it ensures each piece gets proper attention and nothing ships broken. + +--- + +## Bug Summary + +| Severity | Count | Blocks Ship? | +|----------|-------|-------------| +| Critical (runtime crash) | 3 | Yes — C1, C2, C3 | +| Critical (security) | 3 | Yes — C4, C5, C6 | +| Major | 8 | Some — M1, M2, M3 are high risk | +| Minor | 8 | No | +| **Total** | **22** | | diff --git a/docs/plans/PROJECT_STATUS.md b/docs/plans/PROJECT_STATUS.md new file mode 100644 index 00000000..57cafbf6 --- /dev/null +++ b/docs/plans/PROJECT_STATUS.md @@ -0,0 +1,462 @@ +# Voicebox Project Status & Roadmap + +> Last updated: 2026-03-12 | Current version: **v0.1.13** | 13.1k stars | 176 open issues | 28 open PRs + +--- + +## Table of Contents + +1. [Architecture Overview](#architecture-overview) +2. [Current State](#current-state) +3. [Open PRs — Triage & Analysis](#open-prs--triage--analysis) +4. [Open Issues — Categorized](#open-issues--categorized) +5. [Existing Plan Documents — Status](#existing-plan-documents--status) +6. [New Model Integration — Landscape](#new-model-integration--landscape) +7. [Architectural Bottlenecks](#architectural-bottlenecks) +8. [Recommended Priorities](#recommended-priorities) + +--- + +## Architecture Overview + +``` +┌─────────────────────────────────────────────────────┐ +│ Tauri Shell (Rust) │ +│ ┌───────────────────────────────────────────────┐ │ +│ │ React Frontend (app/) │ │ +│ │ Zustand stores · API client · Generation UI │ │ +│ │ Stories Editor · Voice Profiles · Model Mgmt │ │ +│ └──────────────────────┬────────────────────────┘ │ +│ │ HTTP :17493 │ +│ ┌──────────────────────▼────────────────────────┐ │ +│ │ FastAPI Backend (backend/) │ │ +│ │ ┌─────────────┐ ┌───────────┐ ┌─────────┐ │ │ +│ │ │ TTSBackend │ │ STTBackend│ │ Profiles│ │ │ +│ │ │ (Protocol) │ │ (Whisper) │ │ History │ │ │ +│ │ │ ┌────────┐ │ └───────────┘ │ Stories │ │ │ +│ │ │ │PyTorch │ │ └─────────┘ │ │ +│ │ │ │or MLX │ │ │ │ +│ │ │ └────────┘ │ │ │ +│ │ └─────────────┘ │ │ +│ └───────────────────────────────────────────────┘ │ +└─────────────────────────────────────────────────────┘ +``` + +### Key Files + +| Layer | File | Purpose | +|-------|------|---------| +| Backend entry | `backend/main.py` | FastAPI app, all API routes (~1700 lines) | +| TTS protocol | `backend/backends/__init__.py:14-81` | `TTSBackend` Protocol definition | +| TTS factory | `backend/backends/__init__.py:118-137` | Singleton backend selection (MLX vs PyTorch) | +| PyTorch TTS | `backend/backends/pytorch_backend.py` | Qwen3-TTS via `qwen_tts` package | +| MLX TTS | `backend/backends/mlx_backend.py` | Qwen3-TTS via `mlx_audio.tts` | +| Platform detect | `backend/platform_detect.py` | Apple Silicon → MLX, else → PyTorch | +| API types | `backend/models.py` | Pydantic request/response models | +| Frontend API | `app/src/lib/api/client.ts` | Hand-written fetch wrapper | +| Frontend types | `app/src/lib/api/types.ts` | TypeScript API types | +| 
Generation form | `app/src/components/Generation/GenerationForm.tsx` | TTS generation UI | +| Model manager | `app/src/components/ServerSettings/ModelManagement.tsx` | Model download/status UI | +| Gen form hook | `app/src/lib/hooks/useGenerationForm.ts` | Form validation + submission | + +### How TTS Generation Works (Current Flow) + +``` +POST /generate + 1. Look up voice profile from DB + 2. Check model cache → if missing, trigger background download, return HTTP 202 + 3. Load model (lazy): tts_backend.load_model(model_size) + 4. Create voice prompt: profiles.create_voice_prompt_for_profile() + → tts_backend.create_voice_prompt(audio_path, reference_text) + 5. Generate: tts_backend.generate(text, voice_prompt, language, seed, instruct) + 6. Save WAV → data/generations/{id}.wav + 7. Insert history record in SQLite + 8. Return GenerationResponse +``` + +--- + +## Current State + +### What's Shipped (v0.1.13) + +- Qwen3-TTS voice cloning (1.7B and 0.6B models) +- MLX backend for Apple Silicon, PyTorch for everything else +- Voice profiles with multi-sample support +- Stories editor (multi-track DAW timeline) +- Whisper transcription (base, small, medium, large variants) +- Model management UI with download progress (SSE) +- Generation history with caching +- Streaming generation endpoint (MLX only) +- Delivery instructions (instruct parameter) + +### What's NOT Shipped But Has Code + +| Feature | Branch | Status | +|---------|--------|--------| +| External provider binaries (CUDA split) | `external-provider-binaries` | PR #33, significant work done, stale since Feb | +| Dual server binaries | `feat/dual-server-binaries` | Branch exists, no PR | +| Multi-sample fix | `fix-multi-sample` | Branch exists, no PR | +| Model download notification fix | `fix-dl-notification-...` | Branch exists, no PR | + +### Hardcoded Qwen3-TTS Assumptions + +These are the specific coupling points that block multi-model support: + +| Location | What's Hardcoded | +|----------|-----------------| +| `backend/models.py:58` | `model_size` regex: `^(1\.7B\|0\.6B)$` | +| `backend/main.py:611` | Default: `model_size or "1.7B"` | +| `backend/main.py:1322-1365` | Model status list (2 Qwen + 4 Whisper) | +| `backend/main.py:1523-1548` | Download trigger map | +| `backend/main.py:1597-1628` | Delete map | +| `backend/backends/pytorch_backend.py:65-68` | HF repo ID map | +| `backend/backends/mlx_backend.py:41-44` | MLX repo ID map | +| `backend/backends/__init__.py:118-137` | Single global TTS backend | +| `app/src/lib/hooks/useGenerationForm.ts:17` | `modelSize: z.enum(['1.7B', '0.6B'])` | +| `app/src/lib/hooks/useGenerationForm.ts:70-71` | `modelName = "qwen-tts-${data.modelSize}"` | +| `app/src/components/Generation/GenerationForm.tsx:140-141` | Hardcoded "Qwen TTS" labels | +| `app/src/components/ServerSettings/ModelManagement.tsx:166-213` | Filters by `qwen-tts` and `whisper` prefix | +| `backend/utils/cache.py` | Voice prompt cache uses `torch.save()` | + +--- + +## Open PRs — Triage & Analysis + +### Merge-Ready / Near-Ready (Bug Fixes & Small Features) + +| PR | Title | Risk | Notes | +|----|-------|------|-------| +| **#250** | docs: align local API port examples | None | Docs-only | +| **#230** | docs: fix README grammar | None | Docs-only | +| **#243** | a11y: screen reader and keyboard improvements | Low | Accessibility, no backend changes | +| **#175** | Fix #134: duplicate profile name validation | Low | Simple validation | +| **#178** | Fix #168 #140: generation error handling | Low | Error handling 
improvements | +| **#152** | Fix: prevent crashes when HuggingFace unreachable | Medium | Monkey-patches HF hub; solves real offline bug (#150, #151) | +| **#218** | fix: unify qwen tts cache dir on Windows | Low | Windows-specific path fix | +| **#214** | fix: panic on launch from tokio::spawn | Low | Rust-side Tauri fix | +| **#210** | fix: Linux NVIDIA GBM buffer crash | Low | Linux-specific, narrowly scoped | +| **#88** | security: restrict CORS to known local origins | Low | Security hardening | + +### Significant Feature PRs + +| PR | Title | Complexity | Dependencies | Notes | +|----|-------|-----------|--------------|-------| +| **#97** | fix: pass language parameter to TTS models | Medium | None | **Critical bug** — language param was silently dropped. Adds `LANGUAGE_CODE_TO_NAME` mapping to both backends. Should be high priority. | +| **#133** | feat: network access toggle | Low | None | Wires up existing plumbing (`--host 0.0.0.0`). Clean, small. | +| **#238** | download cancel/clear UI + error panel | Medium | None | Adds cancel buttons, VS Code-style Problems panel, fixes whisper-large repo. Quality-of-life win. | +| **#99** | feat: chunked TTS with quality selector | Medium | None | Solves the 500-char/2048-token limit. Sentence-aware splitting, crossfade concat, 44.1kHz upsampling. Addresses #191, #203, #69, #111. | +| **#154** | feat: Audiobook tab | Medium | Depends on #99 concepts | Full audiobook workflow — chunked gen, preview, auto-save to Stories. New route + tab. | +| **#91** | fix: CoreAudio device enumeration | Medium | None | macOS audio device handling. | + +### Architectural PRs (Need Careful Review) + +| PR | Title | Complexity | Notes | +|----|-------|-----------|-------| +| **#33** | CUDA GPU Support — External Provider Binaries | **Very High** | The big one. Splits monolithic backend into main app + downloadable provider executables (PyTorch CPU, CUDA). New provider management system, CI/CD for R2 uploads, provider settings UI. Created Feb 1, significant codebase. **This is the foundation for multi-model support** but is currently Qwen-only. | +| **#225** | feat: custom HuggingFace model support | High | Adds `custom_models.py`, `custom:` model IDs, frontend model grouping (Built-in vs Custom). **Takes a different approach than #33** — keeps single backend but allows arbitrary HF repos. These two PRs may conflict architecturally. | +| **#194** | feat: Hebrew + Chatterbox TTS | High | **First non-Qwen TTS model.** Adds `ChatterboxTTSBackend` alongside existing backends. Routes by language (`he` → Chatterbox, else → Qwen). Adds Hebrew Whisper models. Includes a lot of cleanup. Important precedent for multi-model. | +| **#195** | feat: per-profile LoRA fine-tuning | **Very High** | Depends on #194. Training pipeline, adapter management, SSE progress, 15 new API endpoints. New DB tables. Forces PyTorch even on MLX systems for adapter inference. | +| **#161** | feat: Docker + web deployment | High | 3-stage Dockerfile, SPA serving from FastAPI, docker-compose. Implements the Docker deployment plan. | +| **#124** | Add Dockerfiles + docker-compose + docs | Medium | Earlier, simpler Docker attempt. Overlaps with #161. | +| **#123** | added docker | Low | Minimal Docker PR. Overlaps with #161 and #124. | +| **#227** | fix: harden input validation & file safety | Medium | Follow-up to #225. Atomic writes, threading locks, input validation. Good hardening but coupled to the custom models feature. 
| + +### PRs That Need Author Action / Are Stale + +| PR | Title | Notes | +|----|-------|-------| +| **#237** | fix: bundle qwen_tts source files in PyInstaller | Solves #212 but needs review for build system impact | +| **#215** | Update prerequisites with Tauri deps | Branch is `main` — will have conflicts | +| **#89** | Linux Support | Branch is `main` — will have conflicts. Broad scope. | +| **#83** | Update download links for v0.1.12 | Outdated (we're on v0.1.13) | + +--- + +## Open Issues — Categorized + +### GPU / Hardware Detection (19 issues) + +The single most reported category. Users on Windows with NVIDIA GPUs frequently report "GPU not detected." + +**Root causes (likely):** +- PyInstaller binary doesn't bundle CUDA correctly → falls back to CPU +- DirectML/Vulkan path not implemented (AMD on Windows) +- Binary size limit means CUDA can't ship in the main release + +**Key issues:** #239, #222, #220, #217, #208, #198, #192, #167, #164, #141, #130, #127 + +**Fix path:** PR #33 (external provider binaries) is designed to solve this. Ship a small main app, let users download the CUDA provider separately. + +### Model Downloads (20 issues) + +Second most reported. Users get stuck downloads, can't resume, no cancel button, no offline fallback. + +**Key issues:** #249, #240, #221, #216, #212, #181, #180, #159, #150, #149, #145, #143, #135, #134 + +**Fix path:** PR #238 (cancel/clear UI), PR #152 (offline crash fix). Resume support not yet addressed. + +### Language Requests (18 issues) + +Strong demand for: Hindi (#245), Indonesian (#247), Dutch (#236), Hebrew (#199), Greek (#188), Portuguese (#183), Persian (#162), and many more. + +**Key issues:** #247, #245, #236, #211, #205, #199, #189, #188, #187, #183, #179, #162 + +**Fix path:** PR #97 (pass language param — currently silently dropped!) is the prerequisite. Qwen3-TTS already supports many languages; the bug is that the language code isn't forwarded. Multi-model (#194 Chatterbox for Hebrew) expands coverage further. + +### New Model Requests (5 explicit issues) + +| Issue | Model Requested | +|-------|----------------| +| #226 | GGUF support | +| #172 | VibeVoice | +| #138 | Export to ONNX/Piper format | +| #132 | LavaSR (transcription) | +| #76 | (General model expansion) | + +Community is also vocally requesting: LuxTTS, Chatterbox, XTTS-v2, Fish Speech, CosyVoice, Kokoro on social media and in issue comments. + +### Long-Form / Chunking (5 issues) + +Users hitting the ~500 character practical limit. + +**Key issues:** #234 (queue system), #203 (500 char limit), #191 (auto-split), #111, #69 + +**Fix path:** PR #99 (chunked TTS + quality selector) directly addresses this. PR #154 (Audiobook tab) builds on it. 
+ +### Feature Requests (23 issues) + +Notable requests: +- **#234** — Queue system for batch generation +- **#182** — Concurrent/multi-thread generation +- **#173** — Vocal intonation/inflection control +- **#165** — Audiobook mode +- **#144** — Copy text to clipboard +- **#184** — Cancel button for progress bar +- **#242** — Seed value pinning for consistency +- **#228** — Always use 0.6B option +- **#233** — Transcribe audio API improvements +- **#235** — Finetuned Qwen3-TTS tokenizer + +### Bugs (19 issues) + +| Category | Issues | +|----------|--------| +| Generation failures | #248 (broken pipe), #219 (unsupported scalarType), #202 (clipping error), #170 (load failed) | +| UI bugs | #231 (history not updating), #190 (mobile landing), #169 (blank interface) | +| File operations | #207 (transcribe file error), #168 (no such file), #142 (download audio fail) | +| Server lifecycle | #166 (server processes remain), #164 (no auto-update) | +| Database | #174 (sqlite3 IntegrityError) | +| Dependency | #131 (numpy ABI mismatch), #209 (import error) | + +--- + +## Existing Plan Documents — Status + +| Document | Target Version | Status | Relevance | +|----------|---------------|--------|-----------| +| `TTS_PROVIDER_ARCHITECTURE.md` | v0.1.13 | **Partially implemented** in PR #33 | Core architecture for multi-model + CUDA distribution | +| `EXTERNAL_PROVIDERS.md` | v0.2.0 | **Not started** | Remote server support. API path inconsistency with provider arch doc (`/v1/` vs `/tts/`) | +| `MLX_AUDIO.md` | — | **Shipped** (the only one) | MLX backend is live. 0.6B MLX model still missing. | +| `DOCKER_DEPLOYMENT.md` | v0.2.0 | **PR exists** (#161) | Waiting on review. No official images published. | +| `OPENAI_SUPPORT.md` | v0.2.0 | **Not started** | OpenAI-compatible API layer. Linked to issue #10. Low complexity. | + +### Cross-Document Conflicts + +1. **API path inconsistency:** Provider arch uses `/tts/generate`, External providers uses `/v1/generate`, OpenAI compat uses `/v1/audio/speech`. Need to reconcile. +2. **Docker vs. Provider split:** Docker doc assumes monolithic backend. Provider arch splits into separate binaries. Need to decide: does Docker run the monolith or individual providers? +3. **Version targeting:** Provider arch targets v0.1.13 (current!) but isn't merged. Everything else targets v0.2.0. + +--- + +## New Model Integration — Landscape + +### Models Worth Supporting (2026 SOTA) + +| Model | Cloning | Speed | Sample Rate | Languages | VRAM | Integration Ease | Repo | +|-------|---------|-------|-------------|-----------|------|-----------------|------| +| **LuxTTS** | 3s zero-shot | 150x RT, CPU ok | 48 kHz | English-first | <1 GB | Easy | `ysharma3501/LuxTTS` | +| **Chatterbox** | 5s zero-shot | Sub-200ms streaming | 24-48 kHz | 23+ | Low | Medium | `resemble-ai/chatterbox` | +| **XTTS-v2** | 6s zero-shot | Fast mid-GPU | 24 kHz | 17+ | Medium | Medium | `coqui/XTTS-v2` | +| **Fish Speech** | 10-30s few-shot | Real-time | 24-44 kHz | 50+ | Medium | Medium | `fishaudio/fish-speech` | +| **CosyVoice2-0.5B** | 3-10s zero-shot | Very fast | 24 kHz | Multilingual | Low | Easy | Alibaba HF org | +| **Kokoro-82M** | 3s instant | CPU realtime | 24 kHz | English | Tiny | Medium | Kokoro repo | + +### What's Needed Architecturally for Multi-Model + +The current codebase assumes one TTS model family (Qwen3-TTS). Adding any new model requires: + +1. **Model type concept** — A `model_type` field (e.g. `qwen`, `luxtts`, `chatterbox`) alongside `model_size`. 
The `GenerationRequest` schema, frontend form, and all model config dicts need updating. + +2. **Multiple backend instances** — The singleton `get_tts_backend()` needs to become a registry. Different models have different voice prompt formats, different inference APIs, different sample rates. + +3. **Voice prompt format abstraction** — Qwen uses `torch.save()`-serialized tensors. LuxTTS uses `encode_prompt()` returning its own format. Chatterbox uses audio-path-based cloning. The cache system (`backend/utils/cache.py`) needs to handle heterogeneous formats. + +4. **Sample rate normalization** — Qwen outputs 24 kHz. LuxTTS outputs 48 kHz. The Stories editor and audio pipeline need to handle mixed rates. + +5. **Per-model capabilities** — Not all models support `instruct` (delivery instructions), not all support streaming, not all support the same languages. The UI needs to adapt. + +### PR #194 as Precedent + +The Hebrew/Chatterbox PR (#194) is the first attempt at multi-model. It takes a pragmatic approach: route by language (`he` → Chatterbox, else → Qwen). This works for one extra model but doesn't scale — what happens when you want Chatterbox for English too? + +### PR #225 as Alternative Approach + +The custom HuggingFace models PR (#225) takes a different angle: let users register arbitrary HF repos and attempt to load them through the existing Qwen backend. This is flexible but fragile — it assumes all models have the same API as Qwen3-TTS. + +### PR #33 as Foundation + +The external provider binaries PR (#33) has the most robust architecture for multi-model, since each provider is a separate process with its own dependencies. But it's complex, currently Qwen-only, and has been stale since early February. + +--- + +## Architectural Bottlenecks + +### 1. Single Backend Singleton + +**File:** `backend/backends/__init__.py:118-137` + +The entire TTS system runs through one global `_tts_backend` instance. You literally cannot have two models loaded. This is the #1 blocker for multi-model support. + +### 2. `main.py` is 1700+ Lines + +All API routes, all model configs, all business logic in one file. Three separate hardcoded model config dicts that must stay in sync. Any multi-model change touches this file heavily. + +### 3. Model Config is Scattered + +Model identifiers, HF repo IDs, display names, and download logic are duplicated across: +- `main.py` (3 separate dicts) +- `pytorch_backend.py` (HF repo map) +- `mlx_backend.py` (MLX repo map) +- `GenerationForm.tsx` (UI labels) +- `useGenerationForm.ts` (validation schema) +- `ModelManagement.tsx` (prefix filters) + +There is no single source of truth for "what models does Voicebox support." + +### 4. Voice Prompt Cache Assumes PyTorch Tensors + +`backend/utils/cache.py` uses `torch.save()` / `torch.load()` for caching voice prompts. Models that don't use PyTorch tensors (LuxTTS, MLX-native models) can't use this cache. + +### 5. Frontend Assumes Qwen Model Sizes + +The generation form schema (`useGenerationForm.ts:17`) validates `model_size` as `'1.7B' | '0.6B'`. The model management UI filters by string prefix `qwen-tts`. Adding any model requires touching 3-4 frontend files. + +--- + +## Recommended Priorities + +### Tier 1 — Ship Now (Bug Fixes & Critical Improvements) + +These PRs fix real user pain with low risk. Can be reviewed and merged quickly. 
+ +| Priority | PR | Impact | Effort | +|----------|-----|--------|--------| +| 1 | **#97** — Pass language param to TTS | Fixes all non-English generation (18 language issues) | Low | +| 2 | **#238** — Download cancel/clear UI | Addresses 20 download-related issues | Low | +| 3 | **#152** — Offline mode crash fix | Fixes #150, #151 | Low | +| 4 | **#99** — Chunked TTS + quality selector | Removes 500-char limit, addresses 5 issues | Medium | +| 5 | **#218** — Windows HF cache dir fix | Windows-specific pain | Low | +| 6 | **#175, #178** — Profile validation + error handling | Small fixes | Low | +| 7 | **#250, #230** — Docs fixes | Zero risk | None | +| 8 | **#133** — Network access toggle | Wires up existing code | Low | +| 9 | **#88** — CORS restriction | Security improvement | Low | +| 10 | **#214** — Tauri window close panic fix | Stability | Low | + +### Tier 2 — Next Release (v0.2.0 Foundations) + +These require more review but unlock major capabilities. + +| Priority | Item | Impact | Effort | Dependencies | +|----------|------|--------|--------|-------------| +| 1 | **PR #33** — External provider binaries | Solves GPU distribution (19 issues), foundation for multi-model | Very High | Needs rebase, thorough review | +| 2 | **Multi-model abstraction layer** | Required before adding LuxTTS/Chatterbox/etc. | High | Informed by #33, #194, #225 | +| 3 | **PR #161** — Docker deployment | Server/headless users | Medium | Independent of #33 | +| 4 | **PR #194** — Hebrew + Chatterbox | First non-Qwen model, language expansion | High | Should align with multi-model abstraction | +| 5 | **PR #154** — Audiobook tab | Significant feature for long-form users | Medium | Benefits from #99 (chunking) | + +### Tier 3 — Future (v0.3.0+) + +| Item | Notes | +|------|-------| +| LuxTTS integration | 48 kHz, low VRAM, but needs multi-model arch first | +| XTTS-v2 / Fish Speech | Multilingual powerhouses | +| OpenAI-compatible API (plan doc exists) | Low effort once API is stable | +| LoRA fine-tuning (PR #195) | Complex, depends on #194 | +| External/remote providers (plan doc exists) | Depends on provider architecture | +| GGUF support (#226) | Depends on model ecosystem maturity | +| Queue system (#234) | Batch generation | +| Real-time streaming synthesis | MLX-only currently, needs PyTorch path | + +### Decision Point: Multi-Model Architecture + +Before adding any new TTS model, a decision is needed on *how*: + +**Option A — Provider Binary Split (PR #33 approach)** +Each model family is a separate executable/process. Most isolated, most flexible, but most complex. Solves the CUDA distribution problem simultaneously. + +**Option B — In-Process Model Registry** +Keep everything in one process but replace the singleton with a registry that can instantiate multiple `TTSBackend` implementations. Simpler, but doesn't solve binary size / CUDA distribution. + +**Option C — Hybrid (Recommended)** +Use Option B for lightweight models (LuxTTS, Kokoro — small, CPU-friendly) that can coexist in-process. Use Option A for heavy models (CUDA Qwen3-TTS, Fish Speech) that need their own process/dependencies. The provider architecture from PR #33 becomes the escape hatch for heavy models, while light models are built-in. + +This matches how PR #194 already works (Chatterbox loaded in-process alongside Qwen) while keeping the door open for PR #33's provider split. 
+ +--- + +## Branch Inventory + +| Branch | PR | Status | Notes | +|--------|-----|--------|-------| +| `external-provider-binaries` | #33 | Open, stale | Major architecture work | +| `feat/dual-server-binaries` | — | No PR | Related to provider split? | +| `fix-multi-sample` | — | No PR | Voice profile multi-sample fix | +| `fix-dl-notification-...` | — | No PR | Model download UX | +| `improvements` | — | No PR | Unknown scope | +| `stories` | — | No PR | Stories editor work? | +| `windows-server-shutdown` | — | No PR | Windows lifecycle | +| `model-dl-fix` | — | No PR | Model download fix | +| `channels` | — | No PR | Audio channels | +| `audio-export-entitlement-fix` | — | No PR | macOS entitlements | +| `better-docs` | — | No PR | Documentation | + +--- + +## Quick Reference: API Endpoints + +
+All current endpoints (v0.1.13) + +| Endpoint | Method | Purpose | +|----------|--------|---------| +| `/health` | GET | Health check, model/GPU status | +| `/profiles` | POST, GET | Create/list voice profiles | +| `/profiles/{id}` | GET, PUT, DELETE | Profile CRUD | +| `/profiles/{id}/samples` | POST, GET | Add/list voice samples | +| `/profiles/{id}/avatar` | POST, GET, DELETE | Avatar management | +| `/profiles/{id}/export` | GET | Export profile as ZIP | +| `/profiles/import` | POST | Import profile from ZIP | +| `/generate` | POST | Generate speech | +| `/generate/stream` | POST | Stream speech (SSE) | +| `/history` | GET | List generation history | +| `/history/{id}` | GET, DELETE | Get/delete generation | +| `/history/{id}/export` | GET | Export generation ZIP | +| `/history/{id}/export-audio` | GET | Export audio only | +| `/transcribe` | POST | Transcribe audio (Whisper) | +| `/models/status` | GET | All model statuses | +| `/models/download` | POST | Trigger model download | +| `/models/{name}` | DELETE | Delete downloaded model | +| `/models/load` | POST | Load model into memory | +| `/models/unload` | POST | Unload model | +| `/models/progress/{name}` | GET | SSE download progress | +| `/tasks/active` | GET | Active downloads/generations | +| `/stories` | POST, GET | Create/list stories | +| `/stories/{id}` | GET, PUT, DELETE | Story CRUD | +| `/stories/{id}/items` | POST, GET | Story items CRUD | +| `/stories/{id}/export` | GET | Export story audio | +| `/channels` | POST, GET | Audio channel CRUD | +| `/channels/{id}` | PUT, DELETE | Channel update/delete | +| `/cache/clear` | POST | Clear voice prompt cache | + +
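For quick smoke-testing against a local server, a minimal client call looks like the sketch below. The request field names are illustrative assumptions; the authoritative schema is `GenerationRequest` in `backend/models.py`.

```python
# Hypothetical smoke test — body field names are assumptions, not the documented schema.
import httpx

BASE = "http://localhost:17493"

print(httpx.get(f"{BASE}/health").json())

resp = httpx.post(
    f"{BASE}/generate",
    json={
        "text": "Hello from Voicebox.",
        "profile_id": "<profile-id>",  # assumed field name
        "model_size": "1.7B",
        "language": "auto",
    },
    timeout=300.0,
)
print(resp.status_code)
```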
diff --git a/scripts/split_binary.py b/scripts/split_binary.py new file mode 100644 index 00000000..0310fbd8 --- /dev/null +++ b/scripts/split_binary.py @@ -0,0 +1,82 @@ +""" +Split a large binary into chunks for GitHub Releases (<2 GB each). + +Usage: + python scripts/split_binary.py backend/dist/voicebox-server-cuda.exe + python scripts/split_binary.py backend/dist/voicebox-server-cuda.exe --chunk-size 1900000000 + python scripts/split_binary.py backend/dist/voicebox-server-cuda.exe --output release-assets/ + +The script produces: + - voicebox-server-cuda.part00.exe, .part01.exe, ... (binary chunks) + - voicebox-server-cuda.sha256 (SHA-256 checksum of the complete file) + - voicebox-server-cuda.manifest (ordered list of part filenames) +""" + +import argparse +import hashlib +import sys +from pathlib import Path + + +def split(input_path: Path, chunk_size: int, output_dir: Path): + output_dir.mkdir(parents=True, exist_ok=True) + data = input_path.read_bytes() + total_size = len(data) + + # Write SHA-256 of the complete file + sha256 = hashlib.sha256(data).hexdigest() + checksum_file = output_dir / f"{input_path.stem}.sha256" + checksum_file.write_text(f"{sha256} {input_path.name}\n") + + # Split into chunks + parts = [] + for i in range(0, total_size, chunk_size): + part_index = len(parts) + part_name = f"{input_path.stem}.part{part_index:02d}{input_path.suffix}" + part_path = output_dir / part_name + part_path.write_bytes(data[i:i + chunk_size]) + parts.append(part_name) + + # Write manifest (ordered list of part filenames) + manifest_file = output_dir / f"{input_path.stem}.manifest" + manifest_file.write_text("\n".join(parts) + "\n") + + print(f"Input: {input_path} ({total_size / (1024**3):.2f} GB)") + print(f"Output: {output_dir}/") + print(f"Parts: {len(parts)} (chunk size: {chunk_size / (1024**3):.2f} GB)") + print(f"SHA-256: {sha256}") + print(f"Manifest: {manifest_file.name}") + for p in parts: + size = (output_dir / p).stat().st_size + print(f" {p} ({size / (1024**3):.2f} GB)") + + +def main(): + parser = argparse.ArgumentParser( + description="Split a large binary into chunks for GitHub Releases" + ) + parser.add_argument("input", type=Path, help="Path to the binary file to split") + parser.add_argument( + "--chunk-size", + type=int, + default=1_900_000_000, # 1.9 GB — safely under 2 GB GitHub limit + help="Maximum chunk size in bytes (default: 1.9 GB)", + ) + parser.add_argument( + "--output", + type=Path, + default=None, + help="Output directory (default: same directory as input)", + ) + args = parser.parse_args() + + if not args.input.exists(): + print(f"Error: {args.input} does not exist", file=sys.stderr) + sys.exit(1) + + output_dir = args.output or args.input.parent + split(args.input, args.chunk_size, output_dir) + + +if __name__ == "__main__": + main() diff --git a/tauri/src-tauri/src/main.rs b/tauri/src-tauri/src/main.rs index 255655aa..58f7b3d7 100644 --- a/tauri/src-tauri/src/main.rs +++ b/tauri/src-tauri/src/main.rs @@ -178,6 +178,56 @@ async fn start_server( println!("Data directory: {:?}", data_dir); println!("Remote mode: {}", remote.unwrap_or(false)); + // Check for CUDA backend binary in data directory + let cuda_binary = { + let backends_dir = data_dir.join("backends"); + let cuda_name = if cfg!(windows) { + "voicebox-server-cuda.exe" + } else { + "voicebox-server-cuda" + }; + let path = backends_dir.join(cuda_name); + if path.exists() { + println!("Found CUDA backend binary at {:?}", path); + + // Version check: run --version and compare to app version + let 
app_version = app.config().version.clone().unwrap_or_default(); + let version_ok = match std::process::Command::new(&path) + .arg("--version") + .output() + { + Ok(output) => { + // Output format: "voicebox-server X.Y.Z\n" + let version_str = String::from_utf8_lossy(&output.stdout); + let binary_version = version_str.trim().split_whitespace().last().unwrap_or(""); + if binary_version == app_version { + println!("CUDA binary version {} matches app version", binary_version); + true + } else { + println!( + "CUDA binary version mismatch: binary={}, app={}. Falling back to CPU.", + binary_version, app_version + ); + false + } + } + Err(e) => { + println!("Failed to check CUDA binary version: {}. Falling back to CPU.", e); + false + } + }; + + if version_ok { + Some(path) + } else { + None + } + } else { + println!("No CUDA backend found, using bundled CPU binary"); + None + } + }; + let sidecar_result = app.shell().sidecar("voicebox-server"); let mut sidecar = match sidecar_result { @@ -216,22 +266,32 @@ async fn start_server( println!("Sidecar command created successfully"); - // Pass data directory and port to Python server - sidecar = sidecar.args([ - "--data-dir", - data_dir - .to_str() - .ok_or_else(|| "Invalid data dir path".to_string())?, - "--port", - &SERVER_PORT.to_string(), - ]); - - if remote.unwrap_or(false) { - sidecar = sidecar.args(["--host", "0.0.0.0"]); - } - - println!("Spawning server process..."); - let spawn_result = sidecar.spawn(); + // Build common args + let data_dir_str = data_dir + .to_str() + .ok_or_else(|| "Invalid data dir path".to_string())? + .to_string(); + let port_str = SERVER_PORT.to_string(); + let is_remote = remote.unwrap_or(false); + + // If CUDA binary exists, launch it directly instead of the bundled sidecar + let spawn_result = if let Some(ref cuda_path) = cuda_binary { + println!("Launching CUDA backend: {:?}", cuda_path); + let mut cmd = app.shell().command(cuda_path.to_str().unwrap()); + cmd = cmd.args(["--data-dir", &data_dir_str, "--port", &port_str]); + if is_remote { + cmd = cmd.args(["--host", "0.0.0.0"]); + } + cmd.spawn() + } else { + // Use the bundled CPU sidecar + sidecar = sidecar.args(["--data-dir", &data_dir_str, "--port", &port_str]); + if is_remote { + sidecar = sidecar.args(["--host", "0.0.0.0"]); + } + println!("Spawning server process..."); + sidecar.spawn() + }; let (mut rx, child) = match spawn_result { Ok(result) => result, @@ -549,6 +609,25 @@ async fn stop_server(state: State<'_, ServerState>) -> Result<(), String> { Ok(()) } +#[command] +async fn restart_server( + app: tauri::AppHandle, + state: State<'_, ServerState>, +) -> Result { + println!("restart_server: stopping current server..."); + + // Stop the current server + stop_server(state.clone()).await?; + + // Wait for port to be released + println!("restart_server: waiting for port release..."); + tokio::time::sleep(tokio::time::Duration::from_millis(1000)).await; + + // Start server again (will auto-detect CUDA binary) + println!("restart_server: starting server..."); + start_server(app, state, None).await +} + #[command] fn set_keep_server_running(state: State<'_, ServerState>, keep_running: bool) { *state.keep_running_on_close.lock().unwrap() = keep_running; @@ -640,6 +719,7 @@ pub fn run() { .invoke_handler(tauri::generate_handler![ start_server, stop_server, + restart_server, set_keep_server_running, start_system_audio_capture, stop_system_audio_capture, diff --git a/tauri/src/platform/lifecycle.ts b/tauri/src/platform/lifecycle.ts index 562c75aa..60063f3e 100644 
--- a/tauri/src/platform/lifecycle.ts +++ b/tauri/src/platform/lifecycle.ts @@ -1,5 +1,5 @@ import { invoke } from '@tauri-apps/api/core'; -import { listen, emit } from '@tauri-apps/api/event'; +import { emit, listen } from '@tauri-apps/api/event'; import type { PlatformLifecycle } from '@/platform/types'; class TauriLifecycle implements PlatformLifecycle { @@ -27,6 +27,18 @@ class TauriLifecycle implements PlatformLifecycle { } } + async restartServer(): Promise { + try { + const result = await invoke('restart_server'); + console.log('Server restarted:', result); + this.onServerReady?.(); + return result; + } catch (error) { + console.error('Failed to restart server:', error); + throw error; + } + } + async setKeepServerRunning(keepRunning: boolean): Promise { try { await invoke('set_keep_server_running', { keepRunning }); diff --git a/web/src/platform/lifecycle.ts b/web/src/platform/lifecycle.ts index c5e9ea6e..f40f1a90 100644 --- a/web/src/platform/lifecycle.ts +++ b/web/src/platform/lifecycle.ts @@ -15,6 +15,11 @@ class WebLifecycle implements PlatformLifecycle { // No-op for web - server is managed externally } + async restartServer(): Promise { + // No-op for web - server is managed externally + return import.meta.env.VITE_SERVER_URL || 'http://localhost:17493'; + } + async setKeepServerRunning(_keep: boolean): Promise { // No-op for web }