From 80689ad8ce53b0c7b440a8b1dc0b95239eeacaad Mon Sep 17 00:00:00 2001 From: Jamie Pine Date: Sat, 31 Jan 2026 03:05:50 -0800 Subject: [PATCH 01/33] Implement TTS provider management system and update release workflow - Added support for TTS providers in the backend, including endpoints for listing, starting, stopping, and downloading providers. - Enhanced the release workflow to build and upload TTS provider binaries for both Windows and Linux platforms. - Updated the architecture documentation to reflect the new provider system and its benefits for modularity and user experience. - Introduced a new `ProviderSettings` component in the frontend for managing provider configurations. --- .github/workflows/release.yml | 144 ++++++- .../ServerSettings/ProviderSettings.tsx | 395 ++++++++++++++++++ app/src/components/ServerTab/ServerTab.tsx | 2 + app/src/lib/api/client.ts | 71 ++++ backend/build_binary.py | 32 +- backend/main.py | 215 +++++++++- backend/providers/__init__.py | 220 ++++++++++ backend/providers/base.py | 97 +++++ backend/providers/bundled.py | 139 ++++++ backend/providers/installer.py | 211 ++++++++++ backend/providers/local.py | 187 +++++++++ backend/providers/types.py | 34 ++ backend/tts.py | 52 ++- docs/plans/TTS_PROVIDER_ARCHITECTURE.md | 232 +++++----- providers/README.md | 291 +++++++++++++ providers/pytorch-cpu/build.py | 82 ++++ providers/pytorch-cpu/main.py | 238 +++++++++++ providers/pytorch-cpu/requirements.txt | 8 + providers/pytorch-cuda/build.py | 84 ++++ providers/pytorch-cuda/main.py | 238 +++++++++++ providers/pytorch-cuda/requirements.txt | 10 + 21 files changed, 2811 insertions(+), 171 deletions(-) create mode 100644 app/src/components/ServerSettings/ProviderSettings.tsx create mode 100644 backend/providers/__init__.py create mode 100644 backend/providers/base.py create mode 100644 backend/providers/bundled.py create mode 100644 backend/providers/installer.py create mode 100644 backend/providers/local.py create mode 100644 
backend/providers/types.py create mode 100644 providers/README.md create mode 100644 providers/pytorch-cpu/build.py create mode 100644 providers/pytorch-cpu/main.py create mode 100644 providers/pytorch-cpu/requirements.txt create mode 100644 providers/pytorch-cuda/build.py create mode 100644 providers/pytorch-cuda/main.py create mode 100644 providers/pytorch-cuda/requirements.txt diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 9e65f520..de067112 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -6,7 +6,114 @@ on: tags: - "v*" +env: + PROVIDER_VERSION: "1.0.0" + jobs: + # ============================================ + # Build TTS Providers (uploaded to R2, not GitHub) + # ============================================ + build-providers: + runs-on: ${{ matrix.platform }} + strategy: + fail-fast: false + matrix: + include: + # PyTorch CPU provider (Windows) + - platform: "windows-latest" + provider: "pytorch-cpu" + python-version: "3.12" + # PyTorch CUDA provider (Windows) - large binary, uploaded to R2 + - platform: "windows-latest" + provider: "pytorch-cuda" + python-version: "3.12" + # PyTorch CPU provider (Linux) + - platform: "ubuntu-22.04" + provider: "pytorch-cpu" + python-version: "3.12" + # PyTorch CUDA provider (Linux) - large binary, uploaded to R2 + - platform: "ubuntu-22.04" + provider: "pytorch-cuda" + python-version: "3.12" + + steps: + - uses: actions/checkout@v4 + + - name: Install dependencies (ubuntu only) + if: matrix.platform == 'ubuntu-22.04' + run: | + sudo apt-get update + sudo apt-get install -y llvm-dev + + - name: Setup Python + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + cache: "pip" + + - name: Install Python dependencies (CPU) + if: matrix.provider == 'pytorch-cpu' + run: | + python -m pip install --upgrade pip + pip install pyinstaller + pip install -r providers/pytorch-cpu/requirements.txt + pip install -r backend/requirements.txt + 
+ - name: Install Python dependencies (CUDA) + if: matrix.provider == 'pytorch-cuda' + run: | + python -m pip install --upgrade pip + pip install pyinstaller + pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121 + pip install -r providers/pytorch-cuda/requirements.txt + pip install -r backend/requirements.txt + + - name: Build provider binary + shell: bash + run: | + cd providers/${{ matrix.provider }} + python build.py + + - name: Upload provider to R2 + shell: bash + env: + R2_ACCESS_KEY_ID: ${{ secrets.R2_ACCESS_KEY_ID }} + R2_SECRET_ACCESS_KEY: ${{ secrets.R2_SECRET_ACCESS_KEY }} + R2_ENDPOINT: ${{ secrets.R2_ENDPOINT }} + run: | + # Install AWS CLI (compatible with R2) + pip install awscli + + # Configure AWS CLI for R2 + aws configure set aws_access_key_id $R2_ACCESS_KEY_ID + aws configure set aws_secret_access_key $R2_SECRET_ACCESS_KEY + aws configure set region auto + + # Determine binary name based on platform + if [ "${{ matrix.platform }}" == "windows-latest" ]; then + BINARY_NAME="tts-provider-${{ matrix.provider }}.exe" + BINARY_PATH="providers/${{ matrix.provider }}/dist/tts-provider-${{ matrix.provider }}.exe" + else + BINARY_NAME="tts-provider-${{ matrix.provider }}" + BINARY_PATH="providers/${{ matrix.provider }}/dist/tts-provider-${{ matrix.provider }}" + fi + + # Add platform suffix for clarity + if [ "${{ matrix.platform }}" == "windows-latest" ]; then + UPLOAD_NAME="tts-provider-${{ matrix.provider }}-windows.exe" + else + UPLOAD_NAME="tts-provider-${{ matrix.provider }}-linux" + fi + + # Upload to R2 (bucket: voicebox) + aws s3 cp "$BINARY_PATH" "s3://voicebox/providers/v${{ env.PROVIDER_VERSION }}/$UPLOAD_NAME" \ + --endpoint-url "$R2_ENDPOINT" + + echo "Uploaded $UPLOAD_NAME to R2" + + # ============================================ + # Build Main App (without bundled TTS on Win/Linux) + # ============================================ release: permissions: contents: write @@ -14,22 +121,26 @@ jobs: 
fail-fast: false matrix: include: + # macOS Apple Silicon - MLX bundled (works out of the box) - platform: "macos-latest" args: "--target aarch64-apple-darwin" python-version: "3.12" backend: "mlx" + # macOS Intel - PyTorch bundled (smaller user base, keep simple) - platform: "macos-15-intel" args: "--target x86_64-apple-darwin" python-version: "3.12" backend: "pytorch" + # Linux - No TTS bundled, providers downloaded separately # - platform: 'ubuntu-22.04' # args: '' # python-version: '3.12' - # backend: 'pytorch' + # backend: 'none' + # Windows - No TTS bundled, providers downloaded separately - platform: "windows-latest" args: "" python-version: "3.12" - backend: "pytorch" + backend: "none" runs-on: ${{ matrix.platform }} @@ -55,23 +166,27 @@ jobs: python-version: ${{ matrix.python-version }} cache: "pip" - - name: Install Python dependencies + - name: Install Python dependencies (with TTS) + if: matrix.backend != 'none' run: | python -m pip install --upgrade pip pip install pyinstaller pip install -r backend/requirements.txt + - name: Install Python dependencies (without TTS) + if: matrix.backend == 'none' + run: | + python -m pip install --upgrade pip + pip install pyinstaller + # Install base requirements without PyTorch/Qwen-TTS + pip install fastapi uvicorn sqlalchemy librosa soundfile numpy httpx + pip install huggingface_hub # For Whisper downloads + - name: Install MLX dependencies (Apple Silicon only) if: matrix.backend == 'mlx' run: | pip install -r backend/requirements-mlx.txt - # - name: Install PyTorch with CUDA (Windows only) - # if: matrix.platform == 'windows-latest' - # run: | - # pip install torch --index-url https://download.pytorch.org/whl/cu121 --force-reinstall --no-deps - # pip install torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121 - - name: Build Python server (Linux/macOS) if: matrix.platform != 'windows-latest' run: | @@ -148,10 +263,15 @@ jobs: See the assets below to download and install this version. 
### Installation - - **macOS (Apple Silicon)**: Download the `aarch64.dmg` file - uses MLX for fast native inference + - **macOS (Apple Silicon)**: Download the `aarch64.dmg` file - uses MLX for fast native inference (works out of the box) - **macOS (Intel)**: Download the `x64.dmg` file - uses PyTorch - - **Windows**: Download the `.msi` installer - - **Linux**: Download the `.AppImage` or `.deb` package + - **Windows**: Download the `.msi` installer - requires downloading a TTS provider on first use + - **Linux**: Download the `.AppImage` or `.deb` package - requires downloading a TTS provider on first use + + ### TTS Providers (Windows/Linux) + Windows and Linux users will be prompted to download a TTS provider on first launch: + - **PyTorch CPU** (~300MB) - Works on any system + - **PyTorch CUDA** (~2.4GB) - 4-5x faster on NVIDIA GPUs The app includes automatic updates - future updates will be installed automatically. releaseDraft: true diff --git a/app/src/components/ServerSettings/ProviderSettings.tsx b/app/src/components/ServerSettings/ProviderSettings.tsx new file mode 100644 index 00000000..c18874a9 --- /dev/null +++ b/app/src/components/ServerSettings/ProviderSettings.tsx @@ -0,0 +1,395 @@ +import { useMutation, useQuery, useQueryClient } from '@tanstack/react-query'; +import { Download, Loader2, Trash2 } from 'lucide-react'; +import { useCallback, useState } from 'react'; +import { + AlertDialog, + AlertDialogAction, + AlertDialogCancel, + AlertDialogContent, + AlertDialogDescription, + AlertDialogFooter, + AlertDialogHeader, + AlertDialogTitle, +} from '@/components/ui/alert-dialog'; +import { Badge } from '@/components/ui/badge'; +import { Button } from '@/components/ui/button'; +import { Card, CardContent, CardDescription, CardHeader, CardTitle } from '@/components/ui/card'; +import { Label } from '@/components/ui/label'; +import { RadioGroup, RadioGroupItem } from '@/components/ui/radio-group'; +import { useToast } from '@/components/ui/use-toast'; 
+import { apiClient } from '@/lib/api/client'; +import { useModelDownloadToast } from '@/lib/hooks/useModelDownloadToast'; + +const isMacOS = () => navigator.platform.toLowerCase().includes('mac'); + +type ProviderType = 'auto' | 'bundled-mlx' | 'bundled-pytorch' | 'pytorch-cpu' | 'pytorch-cuda' | 'remote' | 'openai'; + +export function ProviderSettings() { + const { toast } = useToast(); + const queryClient = useQueryClient(); + const [selectedProvider, setSelectedProvider] = useState('auto'); + const [downloadingProvider, setDownloadingProvider] = useState(null); + + const { data: providersData, isLoading } = useQuery({ + queryKey: ['providers'], + queryFn: async () => { + return await apiClient.listProviders(); + }, + refetchInterval: 5000, + }); + + const { data: activeProvider } = useQuery({ + queryKey: ['activeProvider'], + queryFn: async () => { + return await apiClient.getActiveProvider(); + }, + refetchInterval: 5000, + }); + + // Callbacks for download completion + const handleDownloadComplete = useCallback(() => { + setDownloadingProvider(null); + queryClient.invalidateQueries({ queryKey: ['providers'] }); + }, [queryClient]); + + const handleDownloadError = useCallback(() => { + setDownloadingProvider(null); + }, []); + + // Use progress toast hook for the downloading provider + useModelDownloadToast({ + modelName: downloadingProvider || '', + displayName: downloadingProvider || '', + enabled: !!downloadingProvider, + onComplete: handleDownloadComplete, + onError: handleDownloadError, + }); + + const [deleteDialogOpen, setDeleteDialogOpen] = useState(false); + const [providerToDelete, setProviderToDelete] = useState(null); + + const downloadMutation = useMutation({ + mutationFn: async (providerType: string) => { + return await apiClient.downloadProvider(providerType); + }, + onSuccess: (_, providerType) => { + setDownloadingProvider(providerType); + queryClient.invalidateQueries({ queryKey: ['providers'] }); + }, + onError: (error: Error) => { + toast({ 
+ title: 'Download failed', + description: error.message, + variant: 'destructive', + }); + }, + }); + + const startMutation = useMutation({ + mutationFn: async (providerType: string) => { + return await apiClient.startProvider(providerType); + }, + onSuccess: () => { + queryClient.invalidateQueries({ queryKey: ['activeProvider'] }); + toast({ + title: 'Provider started', + description: 'The provider has been started successfully', + }); + }, + onError: (error: Error) => { + toast({ + title: 'Failed to start provider', + description: error.message, + variant: 'destructive', + }); + }, + }); + + const deleteMutation = useMutation({ + mutationFn: async (providerType: string) => { + return await apiClient.deleteProvider(providerType); + }, + onSuccess: () => { + queryClient.invalidateQueries({ queryKey: ['providers'] }); + toast({ + title: 'Provider deleted', + description: 'The provider has been deleted successfully', + }); + }, + onError: (error: Error) => { + toast({ + title: 'Failed to delete provider', + description: error.message, + variant: 'destructive', + }); + }, + }); + + const handleDownload = async (providerType: string) => { + downloadMutation.mutate(providerType); + }; + + const handleStart = async (providerType: string) => { + startMutation.mutate(providerType); + }; + + const handleDelete = (providerType: string) => { + setProviderToDelete(providerType); + setDeleteDialogOpen(true); + }; + + const confirmDelete = () => { + if (providerToDelete) { + deleteMutation.mutate(providerToDelete); + setDeleteDialogOpen(false); + setProviderToDelete(null); + } + }; + + if (isLoading) { + return ( + + + TTS Provider + Choose how Voicebox generates speech + + +
+ +
+
+
+ ); + } + + const installedProviders = providersData?.installed || []; + + // Determine current active provider + const currentProvider = activeProvider?.provider || 'auto'; + + return ( + <> + + + TTS Provider + Choose how Voicebox generates speech + + + setSelectedProvider(value as ProviderType)} + > + {/* Auto-detect */} +
+ + + {currentProvider === 'auto' && ( + + Active + + )} +
+ + {/* PyTorch CUDA */} +
+
+ + +
+
+ {currentProvider === 'pytorch-cuda' && ( + Active + )} + {!installedProviders.includes('pytorch-cuda') && ( + + )} + {installedProviders.includes('pytorch-cuda') && selectedProvider !== 'pytorch-cuda' && ( + + )} + {installedProviders.includes('pytorch-cuda') && ( + + )} +
+
+ + {/* PyTorch CPU (Windows/Linux only) */} + {!isMacOS() && ( +
+
+ + +
+
+ {currentProvider === 'pytorch-cpu' && ( + Active + )} + {!installedProviders.includes('pytorch-cpu') && ( + + )} + {installedProviders.includes('pytorch-cpu') && selectedProvider !== 'pytorch-cpu' && ( + + )} + {installedProviders.includes('pytorch-cpu') && ( + + )} +
+
+ )} + + {/* MLX bundled (macOS only) */} + {isMacOS() && ( +
+
+
+ MLX (Apple Silicon) + {currentProvider === 'bundled-mlx' && ( + Active + )} +
+
+ Bundled with the app - optimized for M1/M2/M3 chips +
+
+
+ )} + + {/* Remote */} +
+
+ + +
+ {selectedProvider === 'remote' && ( +
+ +
+ Remote provider support coming soon +
+
+ )} +
+ + {/* OpenAI */} +
+
+ + +
+ {selectedProvider === 'openai' && ( +
+ +
+ OpenAI provider support coming soon +
+
+ )} +
+
+
+
+ + + + + Delete Provider + + Are you sure you want to delete {providerToDelete}? This will remove the provider + binary from your system. You can download it again later if needed. + + + + Cancel + + Delete + + + + + + ); +} diff --git a/app/src/components/ServerTab/ServerTab.tsx b/app/src/components/ServerTab/ServerTab.tsx index abf91ac2..5a512027 100644 --- a/app/src/components/ServerTab/ServerTab.tsx +++ b/app/src/components/ServerTab/ServerTab.tsx @@ -1,6 +1,7 @@ import { ConnectionForm } from '@/components/ServerSettings/ConnectionForm'; import { ServerStatus } from '@/components/ServerSettings/ServerStatus'; import { UpdateStatus } from '@/components/ServerSettings/UpdateStatus'; +import { ProviderSettings } from '@/components/ServerSettings/ProviderSettings'; import { usePlatform } from '@/platform/PlatformContext'; export function ServerTab() { @@ -11,6 +12,7 @@ export function ServerTab() { + {platform.metadata.isTauri && }
Created by{' '} diff --git a/app/src/lib/api/client.ts b/app/src/lib/api/client.ts index c5b079b2..b5da41a6 100644 --- a/app/src/lib/api/client.ts +++ b/app/src/lib/api/client.ts @@ -199,6 +199,77 @@ class ApiClient { }); } + // Providers + async listProviders(): Promise<{ + providers: Array<{ + type: string; + name: string; + installed: boolean; + size_mb: number | null; + }>; + installed: string[]; + }> { + return this.request('/providers'); + } + + async getActiveProvider(): Promise<{ + provider: string; + health: { + status: string; + provider: string; + version: string | null; + model: string | null; + device: string | null; + }; + status: { + model_loaded: boolean; + model_size: string | null; + available_sizes: string[]; + gpu_available: boolean | null; + vram_used_mb: number | null; + }; + }> { + return this.request('/providers/active'); + } + + async startProvider(providerType: string): Promise<{ + message: string; + provider: { + status: string; + provider: string; + version: string | null; + model: string | null; + device: string | null; + }; + }> { + return this.request('/providers/start', { + method: 'POST', + body: JSON.stringify({ provider_type: providerType }), + }); + } + + async stopProvider(): Promise<{ message: string }> { + return this.request('/providers/stop', { + method: 'POST', + }); + } + + async downloadProvider(providerType: string): Promise<{ + message: string; + provider_type: string; + }> { + return this.request('/providers/download', { + method: 'POST', + body: JSON.stringify({ provider_type: providerType }), + }); + } + + async deleteProvider(providerType: string): Promise<{ message: string }> { + return this.request(`/providers/${providerType}`, { + method: 'DELETE', + }); + } + // History async listHistory(query?: HistoryQuery): Promise { const params = new URLSearchParams(); diff --git a/backend/build_binary.py b/backend/build_binary.py index a2973cd4..fb65863e 100644 --- a/backend/build_binary.py +++ b/backend/build_binary.py @@ 
-30,7 +30,7 @@ def build_server(): args.extend(['--paths', str(qwen_tts_path)]) print(f"Using local qwen_tts source from: {qwen_tts_path}") - # Add common hidden imports + # Add common hidden imports (always included) args.extend([ '--hidden-import', 'backend', '--hidden-import', 'backend.main', @@ -42,38 +42,30 @@ def build_server(): '--hidden-import', 'backend.tts', '--hidden-import', 'backend.transcribe', '--hidden-import', 'backend.platform_detect', - '--hidden-import', 'backend.backends', - '--hidden-import', 'backend.backends.pytorch_backend', + '--hidden-import', 'backend.providers', + '--hidden-import', 'backend.providers.base', + '--hidden-import', 'backend.providers.bundled', + '--hidden-import', 'backend.providers.types', '--hidden-import', 'backend.utils.audio', '--hidden-import', 'backend.utils.cache', '--hidden-import', 'backend.utils.progress', '--hidden-import', 'backend.utils.hf_progress', '--hidden-import', 'backend.utils.validation', - '--hidden-import', 'torch', - '--hidden-import', 'transformers', '--hidden-import', 'fastapi', '--hidden-import', 'uvicorn', '--hidden-import', 'sqlalchemy', '--hidden-import', 'librosa', '--hidden-import', 'soundfile', - '--hidden-import', 'qwen_tts', - '--hidden-import', 'qwen_tts.inference', - '--hidden-import', 'qwen_tts.inference.qwen3_tts_model', - '--hidden-import', 'qwen_tts.inference.qwen3_tts_tokenizer', - '--hidden-import', 'qwen_tts.core', - '--hidden-import', 'qwen_tts.cli', - '--copy-metadata', 'qwen-tts', - '--collect-submodules', 'qwen_tts', - '--collect-data', 'qwen_tts', # Fix for pkg_resources and jaraco namespace packages '--hidden-import', 'pkg_resources.extern', '--collect-submodules', 'jaraco', ]) - # Add MLX-specific imports if building on Apple Silicon + # Platform-specific TTS backend handling if is_apple_silicon(): - print("Building for Apple Silicon - including MLX dependencies") + print("Building for Apple Silicon - including MLX dependencies (bundled)") args.extend([ + 
'--hidden-import', 'backend.backends', '--hidden-import', 'backend.backends.mlx_backend', '--hidden-import', 'mlx', '--hidden-import', 'mlx.core', @@ -88,7 +80,13 @@ def build_server(): '--collect-data', 'mlx_audio', ]) else: - print("Building for non-Apple Silicon platform - PyTorch only") + print("Building for Windows/Linux - excluding PyTorch/Qwen-TTS (providers downloaded separately)") + # Note: PyTorch and Qwen-TTS are NOT included - users will download providers separately + # Only include backend abstraction (no actual TTS implementation) + args.extend([ + '--hidden-import', 'backend.backends', + '--hidden-import', 'backend.backends.pytorch_backend', # Keep for reference, but won't work without PyTorch + ]) args.extend([ '--noconfirm', diff --git a/backend/main.py b/backend/main.py index 59fb9e18..3bd4b7e6 100644 --- a/backend/main.py +++ b/backend/main.py @@ -29,6 +29,8 @@ from .utils.tasks import get_task_manager from .utils.cache import clear_voice_prompt_cache from .platform_detect import get_backend_type +from .providers import get_provider_manager +from .providers.types import ProviderType app = FastAPI( title="voicebox API", @@ -74,7 +76,7 @@ async def health(): from pathlib import Path import os - tts_model = tts.get_tts_model() + tts_model = await tts.get_tts_model_async() backend_type = get_backend_type() # Check for GPU availability (CUDA or MPS) @@ -549,7 +551,7 @@ async def generate_speech( ) # Generate audio - tts_model = tts.get_tts_model() + tts_model = await tts.get_tts_model_async() # Load the requested model size if different from current (async to not block) model_size = data.model_size or "1.7B" @@ -1113,8 +1115,8 @@ async def get_sample_audio(sample_id: str, db: Session = Depends(get_db)): async def load_model(model_size: str = "1.7B"): """Manually load TTS model.""" try: - tts_model = tts.get_tts_model() - await tts_model.load_model_async(model_size) + tts_model = await tts.get_tts_model_async() + await tts_model.load_model(model_size) 
return {"message": f"Model {model_size} loaded successfully"} except Exception as e: raise HTTPException(status_code=500, detail=str(e)) @@ -1172,10 +1174,10 @@ async def get_model_status(): except ImportError: use_scan_cache = False - def check_tts_loaded(model_size: str): + async def check_tts_loaded(model_size: str): """Check if TTS model is loaded with specific size.""" try: - tts_model = tts.get_tts_model() + tts_model = await tts.get_tts_model_async() return tts_model.is_loaded() and getattr(tts_model, 'model_size', None) == model_size except Exception: return False @@ -1211,14 +1213,14 @@ def check_whisper_loaded(model_size: str): "display_name": "Qwen TTS 1.7B", "hf_repo_id": tts_1_7b_id, "model_size": "1.7B", - "check_loaded": lambda: check_tts_loaded("1.7B"), + "check_loaded": lambda: check_tts_loaded("1.7B"), # Async function }, { "model_name": "qwen-tts-0.6B", "display_name": "Qwen TTS 0.6B", "hf_repo_id": tts_0_6b_id, "model_size": "0.6B", - "check_loaded": lambda: check_tts_loaded("0.6B"), + "check_loaded": lambda: check_tts_loaded("0.6B"), # Async function }, { "model_name": "whisper-base", @@ -1356,7 +1358,11 @@ def check_whisper_loaded(model_size: str): # Check if loaded in memory try: - loaded = config["check_loaded"]() + check_func = config["check_loaded"] + if asyncio.iscoroutinefunction(check_func): + loaded = await check_func() + else: + loaded = check_func() except Exception: loaded = False @@ -1379,7 +1385,11 @@ def check_whisper_loaded(model_size: str): except Exception as e: # If check fails, try to at least check if loaded try: - loaded = config["check_loaded"]() + check_func = config["check_loaded"] + if asyncio.iscoroutinefunction(check_func): + loaded = await check_func() + else: + loaded = check_func() except Exception: loaded = False @@ -1406,14 +1416,24 @@ async def trigger_model_download(request: models.ModelDownloadRequest): task_manager = get_task_manager() progress_manager = get_progress_manager() + async def 
load_tts_model_1_7b(): + """Load 1.7B TTS model.""" + tts_model = await tts.get_tts_model_async() + await tts_model.load_model("1.7B") + + async def load_tts_model_0_6b(): + """Load 0.6B TTS model.""" + tts_model = await tts.get_tts_model_async() + await tts_model.load_model("0.6B") + model_configs = { "qwen-tts-1.7B": { "model_size": "1.7B", - "load_func": lambda: tts.get_tts_model().load_model("1.7B"), + "load_func": load_tts_model_1_7b, }, "qwen-tts-0.6B": { "model_size": "0.6B", - "load_func": lambda: tts.get_tts_model().load_model("0.6B"), + "load_func": load_tts_model_0_6b, }, "whisper-base": { "model_size": "base", @@ -1472,6 +1492,171 @@ async def download_in_background(): return {"message": f"Model {request.model_name} download started"} +# ============================================ +# PROVIDER ENDPOINTS +# ============================================ + +@app.get("/providers") +async def list_providers(): + """List all available provider types.""" + manager = get_provider_manager() + installed = await manager.list_installed() + + # Get info for all known provider types + all_providers = [ + "bundled-mlx", + "bundled-pytorch", + "pytorch-cpu", + "pytorch-cuda", + "remote", + "openai", + ] + + providers_info = [] + for provider_type in all_providers: + info = await manager.get_provider_info(provider_type) + providers_info.append(info) + + return { + "providers": providers_info, + "installed": installed, + } + + +@app.get("/providers/installed") +async def list_installed_providers(): + """List installed provider types.""" + manager = get_provider_manager() + installed = await manager.list_installed() + return {"installed": installed} + + +@app.get("/providers/active") +async def get_active_provider(): + """Get information about the currently active provider.""" + manager = get_provider_manager() + provider = await manager.get_active_provider() + + health = await provider.health() + status = await provider.status() + + return { + "provider": 
health["provider"], + "health": health, + "status": status, + } + + +@app.post("/providers/start") +async def start_provider(data: dict): + """Start a specific provider.""" + provider_type = data.get("provider_type") + if not provider_type: + raise HTTPException(status_code=400, detail="provider_type is required") + + manager = get_provider_manager() + try: + await manager.start_provider(provider_type) + provider = await manager.get_active_provider() + health = await provider.health() + return { + "message": f"Provider {provider_type} started", + "provider": health, + } + except NotImplementedError as e: + raise HTTPException(status_code=501, detail=str(e)) + except ValueError as e: + raise HTTPException(status_code=400, detail=str(e)) + except Exception as e: + raise HTTPException(status_code=500, detail=str(e)) + + +@app.post("/providers/stop") +async def stop_provider(): + """Stop the currently active provider.""" + manager = get_provider_manager() + await manager.stop_provider() + return {"message": "Provider stopped"} + + +@app.post("/providers/download") +async def download_provider_endpoint(data: dict): + """Download a provider binary.""" + from .providers.installer import download_provider + + provider_type = data.get("provider_type") + if not provider_type: + raise HTTPException(status_code=400, detail="provider_type is required") + + if provider_type not in ["pytorch-cpu", "pytorch-cuda"]: + raise HTTPException( + status_code=400, + detail=f"Provider type {provider_type} cannot be downloaded" + ) + + try: + # Start download in background + asyncio.create_task(download_provider(provider_type)) + return { + "message": f"Provider {provider_type} download started", + "provider_type": provider_type, + } + except Exception as e: + raise HTTPException(status_code=500, detail=str(e)) + + +@app.get("/providers/download/progress/{provider_type}") +async def get_provider_download_progress(provider_type: str): + """Get provider download progress via Server-Sent 
Events.""" + from fastapi.responses import StreamingResponse + from .utils.progress import get_progress_manager + + progress_manager = get_progress_manager() + + async def event_generator(): + """Generate SSE events for provider download progress.""" + import asyncio + import json + + last_progress = None + + while True: + progress = progress_manager.get_progress(provider_type) + + if progress and progress != last_progress: + yield f"data: {json.dumps(progress)}\n\n" + last_progress = progress + + if progress.get("status") in ["complete", "error"]: + break + + await asyncio.sleep(0.5) + + return StreamingResponse(event_generator(), media_type="text/event-stream") + + +@app.delete("/providers/{provider_type}") +async def delete_provider_endpoint(provider_type: str): + """Delete an installed provider.""" + from .providers.installer import delete_provider + + if provider_type not in ["pytorch-cpu", "pytorch-cuda"]: + raise HTTPException( + status_code=400, + detail=f"Provider type {provider_type} cannot be deleted" + ) + + deleted = delete_provider(provider_type) + + if deleted: + return {"message": f"Provider {provider_type} deleted successfully"} + else: + raise HTTPException( + status_code=404, + detail=f"Provider {provider_type} not found" + ) + + @app.delete("/models/{model_name}") async def delete_model(model_name: str): """Delete a downloaded model from the HuggingFace cache.""" @@ -1522,9 +1707,9 @@ async def delete_model(model_name: str): try: # Check if model is loaded and unload it first if config["model_type"] == "tts": - tts_model = tts.get_tts_model() - if tts_model.is_loaded() and tts_model.model_size == config["model_size"]: - tts.unload_tts_model() + tts_model = await tts.get_tts_model_async() + if tts_model.is_loaded() and getattr(tts_model, 'model_size', None) == config["model_size"]: + tts_model.unload_model() elif config["model_type"] == "whisper": whisper_model = transcribe.get_whisper_model() if whisper_model.is_loaded() and 
whisper_model.model_size == config["model_size"]: diff --git a/backend/providers/__init__.py b/backend/providers/__init__.py new file mode 100644 index 00000000..09854d77 --- /dev/null +++ b/backend/providers/__init__.py @@ -0,0 +1,220 @@ +""" +Provider management system for TTS providers. +""" + +from typing import Optional +import platform +from pathlib import Path + +from .base import TTSProvider +from .types import ProviderType +from .bundled import BundledProvider +from .local import LocalProvider +from .installer import get_provider_binary_path +from ..config import get_data_dir +import subprocess +import socket + + +class ProviderManager: + """Manages TTS provider lifecycle.""" + + def __init__(self): + self.active_provider: Optional[TTSProvider] = None + self._default_provider: Optional[TTSProvider] = None + self._provider_process: Optional[subprocess.Popen] = None + self._provider_port: Optional[int] = None + + def _get_default_provider(self) -> TTSProvider: + """Get the default bundled provider.""" + if self._default_provider is None: + self._default_provider = BundledProvider() + return self._default_provider + + async def get_active_provider(self) -> TTSProvider: + """ + Get the currently active provider. + + Returns: + Active TTS provider instance + """ + if self.active_provider is None: + # Default to bundled provider + self.active_provider = self._get_default_provider() + return self.active_provider + + async def start_provider(self, provider_type: str) -> None: + """ + Start a TTS provider. + + Args: + provider_type: Type of provider to start + """ + if provider_type in ["bundled-mlx", "bundled-pytorch"]: + # Use bundled provider + self.active_provider = self._get_default_provider() + elif provider_type in ["pytorch-cpu", "pytorch-cuda"]: + # Start local provider subprocess + provider_path = get_provider_binary_path(provider_type) + if not provider_path or not provider_path.exists(): + raise ValueError(f"Provider {provider_type} is not installed. 
Please download it first.") + + # Find a free port + port = self._get_free_port() + + # Start provider subprocess + from ..config import get_data_dir + process = subprocess.Popen( + [ + str(provider_path), + "--port", str(port), + "--data-dir", str(get_data_dir()), + ], + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + ) + + # Wait for provider to be ready + base_url = f"http://127.0.0.1:{port}" + await self._wait_for_provider_health(base_url, timeout=30) + + # Create LocalProvider instance + self.active_provider = LocalProvider(base_url) + self._provider_process = process + self._provider_port = port + elif provider_type == "remote": + # Remote provider - will be implemented in Phase 5 + raise NotImplementedError("Remote provider not yet implemented") + elif provider_type == "openai": + # OpenAI provider - will be implemented in Phase 5 + raise NotImplementedError("OpenAI provider not yet implemented") + else: + raise ValueError(f"Unknown provider type: {provider_type}") + + async def stop_provider(self) -> None: + """Stop the active provider.""" + if self.active_provider: + # Only stop if it's not the default bundled provider + if self.active_provider is not self._default_provider: + if hasattr(self.active_provider, 'stop'): + await self.active_provider.stop() + self.active_provider = None + + # Stop subprocess if running + if self._provider_process: + self._provider_process.terminate() + try: + self._provider_process.wait(timeout=5) + except subprocess.TimeoutExpired: + self._provider_process.kill() + self._provider_process = None + self._provider_port = None + + async def list_installed(self) -> list[str]: + """ + List installed provider types. 
+ + Returns: + List of installed provider type strings + """ + installed = [] + + # Bundled providers are always available + system = platform.system() + machine = platform.machine() + + if system == "Darwin" and machine == "arm64": + installed.append("bundled-mlx") + else: + installed.append("bundled-pytorch") + + # Check for downloaded providers (Phase 2) + providers_dir = _get_providers_dir() + if providers_dir.exists(): + for provider_file in providers_dir.glob("tts-provider-*"): + if provider_file.is_file() and provider_file.stat().st_size > 0: + name = provider_file.name + if "pytorch-cpu" in name: + installed.append("pytorch-cpu") + elif "pytorch-cuda" in name: + installed.append("pytorch-cuda") + + return installed + + async def get_provider_info(self, provider_type: str) -> dict: + """ + Get information about a provider. + + Args: + provider_type: Type of provider + + Returns: + Provider information dictionary + """ + if provider_type in ["bundled-mlx", "bundled-pytorch"]: + return { + "type": provider_type, + "name": "Bundled Provider", + "installed": True, + "size_mb": None, # Bundled, no separate size + } + elif provider_type == "pytorch-cpu": + return { + "type": provider_type, + "name": "PyTorch CPU", + "installed": provider_type in await self.list_installed(), + "size_mb": 300, + } + elif provider_type == "pytorch-cuda": + return { + "type": provider_type, + "name": "PyTorch CUDA", + "installed": provider_type in await self.list_installed(), + "size_mb": 2400, + } + else: + return { + "type": provider_type, + "name": provider_type, + "installed": False, + "size_mb": None, + } + + + def _get_free_port(self) -> int: + """Get a free port for the provider server.""" + with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: + s.bind(('', 0)) + return s.getsockname()[1] + + async def _wait_for_provider_health(self, base_url: str, timeout: int = 30) -> None: + """Wait for provider to become healthy.""" + import httpx + import asyncio + + start_time = 
asyncio.get_event_loop().time() + while True: + try: + async with httpx.AsyncClient(timeout=2.0) as client: + response = await client.get(f"{base_url}/tts/health") + if response.status_code == 200: + return + except Exception: + pass + + if asyncio.get_event_loop().time() - start_time > timeout: + raise TimeoutError(f"Provider did not become healthy within {timeout} seconds") + + await asyncio.sleep(0.5) + + +# Global provider manager instance +_provider_manager: Optional[ProviderManager] = None + + +def get_provider_manager() -> ProviderManager: + """Get the global provider manager instance.""" + global _provider_manager + if _provider_manager is None: + _provider_manager = ProviderManager() + return _provider_manager diff --git a/backend/providers/base.py b/backend/providers/base.py new file mode 100644 index 00000000..f3a6b4c2 --- /dev/null +++ b/backend/providers/base.py @@ -0,0 +1,97 @@ +""" +Base protocol for TTS providers. +""" + +from typing import Protocol, Optional, Tuple +from typing_extensions import runtime_checkable +import numpy as np + +from .types import ProviderHealth, ProviderStatus + + +@runtime_checkable +class TTSProvider(Protocol): + """Protocol for TTS provider implementations.""" + + async def generate( + self, + text: str, + voice_prompt: dict, + language: str = "en", + seed: Optional[int] = None, + instruct: Optional[str] = None, + ) -> Tuple[np.ndarray, int]: + """ + Generate speech audio from text. + + Args: + text: Text to synthesize + voice_prompt: Voice prompt dictionary + language: Language code + seed: Random seed for reproducibility + instruct: Delivery instructions + + Returns: + Tuple of (audio_array, sample_rate) + """ + ... + + async def create_voice_prompt( + self, + audio_path: str, + reference_text: str, + use_cache: bool = True, + ) -> Tuple[dict, bool]: + """ + Create voice prompt from reference audio. 
+ + Args: + audio_path: Path to reference audio file + reference_text: Transcript of the audio + use_cache: Whether to use cached prompts + + Returns: + Tuple of (voice_prompt_dict, was_cached) + """ + ... + + async def combine_voice_prompts( + self, + audio_paths: list[str], + reference_texts: list[str], + ) -> Tuple[np.ndarray, str]: + """ + Combine multiple voice prompts. + + Args: + audio_paths: List of audio file paths + reference_texts: List of reference texts + + Returns: + Tuple of (combined_audio_array, combined_text) + """ + ... + + async def load_model(self, model_size: str) -> None: + """Load TTS model.""" + ... + + def unload_model(self) -> None: + """Unload model to free memory.""" + ... + + def is_loaded(self) -> bool: + """Check if model is loaded.""" + ... + + def _get_model_path(self, model_size: str) -> str: + """Get model path for a given size.""" + ... + + async def health(self) -> ProviderHealth: + """Get provider health status.""" + ... + + async def status(self) -> ProviderStatus: + """Get provider model status.""" + ... diff --git a/backend/providers/bundled.py b/backend/providers/bundled.py new file mode 100644 index 00000000..b4a5e2ca --- /dev/null +++ b/backend/providers/bundled.py @@ -0,0 +1,139 @@ +""" +Bundled provider that wraps existing MLX/PyTorch backends. 
+""" + +from typing import Optional, Tuple +import numpy as np +import platform + +from .base import TTSProvider +from .types import ProviderHealth, ProviderStatus +from ..backends import get_tts_backend, TTSBackend +from ..platform_detect import get_backend_type + + +class BundledProvider: + """Provider that wraps the existing bundled TTS backend.""" + + def __init__(self): + self._backend: Optional[TTSBackend] = None + + def _get_backend(self) -> TTSBackend: + """Get or create backend instance.""" + if self._backend is None: + self._backend = get_tts_backend() + return self._backend + + async def generate( + self, + text: str, + voice_prompt: dict, + language: str = "en", + seed: Optional[int] = None, + instruct: Optional[str] = None, + ) -> Tuple[np.ndarray, int]: + """Generate speech audio.""" + backend = self._get_backend() + return await backend.generate(text, voice_prompt, language, seed, instruct) + + async def create_voice_prompt( + self, + audio_path: str, + reference_text: str, + use_cache: bool = True, + ) -> Tuple[dict, bool]: + """Create voice prompt from reference audio.""" + backend = self._get_backend() + return await backend.create_voice_prompt(audio_path, reference_text, use_cache) + + async def combine_voice_prompts( + self, + audio_paths: list[str], + reference_texts: list[str], + ) -> Tuple[np.ndarray, str]: + """Combine multiple voice prompts.""" + backend = self._get_backend() + return await backend.combine_voice_prompts(audio_paths, reference_texts) + + async def load_model(self, model_size: str) -> None: + """Load TTS model.""" + backend = self._get_backend() + # Backends use load_model_async, but Protocol defines load_model + if hasattr(backend, 'load_model_async'): + await backend.load_model_async(model_size) + else: + await backend.load_model(model_size) + + def unload_model(self) -> None: + """Unload model to free memory.""" + backend = self._get_backend() + backend.unload_model() + + def is_loaded(self) -> bool: + """Check if model is 
loaded.""" + backend = self._get_backend() + return backend.is_loaded() + + def _get_model_path(self, model_size: str) -> str: + """Get model path for a given size.""" + backend = self._get_backend() + return backend._get_model_path(model_size) + + async def health(self) -> ProviderHealth: + """Get provider health status.""" + backend = self._get_backend() + backend_type = get_backend_type() + + model_size = None + if backend.is_loaded(): + # Try to get current model size from backend + if hasattr(backend, '_current_model_size') and backend._current_model_size: + model_size = backend._current_model_size + + device = None + if backend_type == "mlx": + device = "metal" + elif hasattr(backend, 'device'): + device = backend.device + + return ProviderHealth( + status="healthy", + provider=f"bundled-{backend_type}", + version=None, # Provider versioning not implemented yet + model=model_size, + device=device, + ) + + async def status(self) -> ProviderStatus: + """Get provider model status.""" + backend = self._get_backend() + backend_type = get_backend_type() + + model_size = None + if backend.is_loaded(): + if hasattr(backend, '_current_model_size') and backend._current_model_size: + model_size = backend._current_model_size + + available_sizes = ["1.7B"] + if backend_type == "pytorch": + available_sizes.append("0.6B") + + gpu_available = None + vram_used_mb = None + + if backend_type == "pytorch": + try: + import torch + gpu_available = torch.cuda.is_available() + if gpu_available: + vram_used_mb = torch.cuda.memory_allocated() / 1024 / 1024 + except ImportError: + pass + + return ProviderStatus( + model_loaded=backend.is_loaded(), + model_size=model_size, + available_sizes=available_sizes, + gpu_available=gpu_available, + vram_used_mb=int(vram_used_mb) if vram_used_mb else None, + ) diff --git a/backend/providers/installer.py b/backend/providers/installer.py new file mode 100644 index 00000000..08297b9d --- /dev/null +++ b/backend/providers/installer.py @@ -0,0 +1,211 
@@ +""" +Provider download and installation manager. +""" + +import asyncio +import httpx +import platform +from pathlib import Path +from typing import Optional + +from .types import ProviderType +from ..utils.progress import get_progress_manager +from ..utils.tasks import get_task_manager + + +# Provider version (independent of app version) +PROVIDER_VERSION = "1.0.0" + +# Base URL for provider downloads (Cloudflare R2) +PROVIDER_DOWNLOAD_BASE_URL = "https://downloads.voicebox.sh/providers" + + +def _get_providers_dir() -> Path: + """Get the directory where providers are stored.""" + system = platform.system() + + if system == "Windows": + appdata = Path.home() / "AppData" / "Roaming" + elif system == "Darwin": + appdata = Path.home() / "Library" / "Application Support" + else: # Linux + appdata = Path.home() / ".local" / "share" + + providers_dir = appdata / "voicebox" / "providers" + providers_dir.mkdir(parents=True, exist_ok=True) + return providers_dir + + +def _get_provider_binary_name(provider_type: str) -> str: + """Get the local binary filename for a provider type.""" + system = platform.system() + ext = ".exe" if system == "Windows" else "" + + binary_map = { + "pytorch-cpu": f"tts-provider-pytorch-cpu{ext}", + "pytorch-cuda": f"tts-provider-pytorch-cuda{ext}", + } + + if provider_type not in binary_map: + raise ValueError(f"Unknown provider type: {provider_type}") + + return binary_map[provider_type] + + +def _get_provider_download_name(provider_type: str) -> str: + """Get the remote download filename for a provider type (includes platform suffix).""" + system = platform.system() + + if system == "Windows": + platform_suffix = "windows" + ext = ".exe" + elif system == "Linux": + platform_suffix = "linux" + ext = "" + else: + raise ValueError(f"Provider downloads not supported on {system}") + + return f"tts-provider-{provider_type}-{platform_suffix}{ext}" + + +def _get_provider_download_url(provider_type: str) -> str: + """Get the download URL for a 
provider.""" + download_name = _get_provider_download_name(provider_type) + return f"{PROVIDER_DOWNLOAD_BASE_URL}/v{PROVIDER_VERSION}/{download_name}" + + +async def download_provider(provider_type: str) -> Path: + """ + Download a provider binary from Cloudflare R2. + + Args: + provider_type: Type of provider to download (e.g., "pytorch-cpu") + + Returns: + Path to the downloaded provider binary + + Raises: + ValueError: If provider_type is invalid + httpx.HTTPError: If download fails + """ + if provider_type not in ["pytorch-cpu", "pytorch-cuda"]: + raise ValueError(f"Provider type {provider_type} cannot be downloaded") + + progress_manager = get_progress_manager() + task_manager = get_task_manager() + + binary_name = _get_provider_binary_name(provider_type) + download_url = _get_provider_download_url(provider_type) + destination = _get_providers_dir() / binary_name + + # Start tracking download + task_manager.start_download(provider_type) + + # Initialize progress state + progress_manager.update_progress( + model_name=provider_type, + current=0, + total=0, # Will be updated once we get Content-Length + filename=binary_name, + status="downloading", + ) + + try: + async with httpx.AsyncClient(timeout=300.0) as client: + # First, get the file size + async with client.stream("GET", download_url) as response: + response.raise_for_status() + + # Get total size from Content-Length header + total_size = int(response.headers.get("Content-Length", 0)) + + if total_size > 0: + progress_manager.update_progress( + model_name=provider_type, + current=0, + total=total_size, + filename=binary_name, + status="downloading", + ) + + # Download with progress tracking + downloaded = 0 + with open(destination, "wb") as f: + async for chunk in response.aiter_bytes(chunk_size=8192): + f.write(chunk) + downloaded += len(chunk) + + # Update progress + progress_manager.update_progress( + model_name=provider_type, + current=downloaded, + total=total_size if total_size > 0 else downloaded, 
+ filename=binary_name, + status="downloading", + ) + + # Mark as complete + progress_manager.update_progress( + model_name=provider_type, + current=downloaded, + total=downloaded, + filename=binary_name, + status="complete", + ) + task_manager.complete_download(provider_type) + + # Make executable on Unix systems + if platform.system() != "Windows": + destination.chmod(0o755) + + return destination + + except Exception as e: + # Mark as error + progress_manager.update_progress( + model_name=provider_type, + current=0, + total=0, + filename=binary_name, + status="error", + ) + task_manager.error_download(provider_type, str(e)) + raise + + +def get_provider_binary_path(provider_type: str) -> Optional[Path]: + """ + Get the path to an installed provider binary. + + Args: + provider_type: Type of provider + + Returns: + Path to provider binary, or None if not installed + """ + binary_name = _get_provider_binary_name(provider_type) + provider_path = _get_providers_dir() / binary_name + + if provider_path.exists() and provider_path.is_file(): + return provider_path + + return None + + +def delete_provider(provider_type: str) -> bool: + """ + Delete an installed provider binary. + + Args: + provider_type: Type of provider to delete + + Returns: + True if deleted, False if not found + """ + provider_path = get_provider_binary_path(provider_type) + + if provider_path and provider_path.exists(): + provider_path.unlink() + return True + + return False diff --git a/backend/providers/local.py b/backend/providers/local.py new file mode 100644 index 00000000..1bde7bfe --- /dev/null +++ b/backend/providers/local.py @@ -0,0 +1,187 @@ +""" +Local provider that communicates with standalone provider servers via HTTP. 
+""" + +from typing import Optional, Tuple +import base64 +import io +import numpy as np +import httpx +import soundfile as sf + +from .base import TTSProvider +from .types import ProviderHealth, ProviderStatus + + +class LocalProvider: + """Provider that communicates with local subprocess via HTTP.""" + + def __init__(self, base_url: str): + """ + Initialize local provider. + + Args: + base_url: Base URL of the provider server (e.g., "http://localhost:8000") + """ + self.base_url = base_url.rstrip('/') + self.client = httpx.AsyncClient(timeout=300.0) # 5 minute timeout for generation + + async def generate( + self, + text: str, + voice_prompt: dict, + language: str = "en", + seed: Optional[int] = None, + instruct: Optional[str] = None, + ) -> Tuple[np.ndarray, int]: + """Generate speech audio.""" + response = await self.client.post( + f"{self.base_url}/tts/generate", + json={ + "text": text, + "voice_prompt": voice_prompt, + "language": language, + "seed": seed, + "model_size": "1.7B", # TODO: Make configurable + } + ) + response.raise_for_status() + data = response.json() + + # Decode base64 audio + audio_bytes = base64.b64decode(data["audio"]) + audio_buffer = io.BytesIO(audio_bytes) + audio, sample_rate = sf.read(audio_buffer) + + return audio, data["sample_rate"] + + async def create_voice_prompt( + self, + audio_path: str, + reference_text: str, + use_cache: bool = True, + ) -> Tuple[dict, bool]: + """Create voice prompt from reference audio.""" + # Read audio file + with open(audio_path, 'rb') as f: + audio_data = f.read() + + # Send multipart form data + files = { + "audio": ("audio.wav", audio_data, "audio/wav") + } + data = { + "reference_text": reference_text, + "use_cache": str(use_cache).lower(), + } + + response = await self.client.post( + f"{self.base_url}/tts/create_voice_prompt", + files=files, + data=data, + ) + response.raise_for_status() + result = response.json() + + return result["voice_prompt"], result.get("was_cached", False) + + async def 
combine_voice_prompts( + self, + audio_paths: list[str], + reference_texts: list[str], + ) -> Tuple[np.ndarray, str]: + """ + Combine multiple voice prompts. + + Note: This is not implemented in the provider API yet. + For now, we'll combine locally by concatenating audio. + """ + import numpy as np + from ..utils.audio import load_audio, normalize_audio + + combined_audio = [] + for audio_path in audio_paths: + audio, sr = load_audio(audio_path) + audio = normalize_audio(audio) + combined_audio.append(audio) + + # Concatenate audio + mixed = np.concatenate(combined_audio) + mixed = normalize_audio(mixed) + + # Combine texts + combined_text = " ".join(reference_texts) + + return mixed, combined_text + + async def load_model(self, model_size: str) -> None: + """Load TTS model.""" + # Model loading is handled automatically by the provider server + # when generate() is called, so this is a no-op + pass + + def unload_model(self) -> None: + """Unload model to free memory.""" + # Model unloading is handled by the provider server + # This is a no-op for local providers + pass + + def is_loaded(self) -> bool: + """Check if model is loaded.""" + # We can't know this without querying the provider + # Return True optimistically + return True + + def _get_model_path(self, model_size: str) -> str: + """Get model path for a given size.""" + # For local providers, model paths are handled by the provider server + # Return a placeholder + return f"Qwen/Qwen3-TTS-12Hz-{model_size}-Base" + + async def health(self) -> ProviderHealth: + """Get provider health status.""" + try: + response = await self.client.get(f"{self.base_url}/tts/health") + response.raise_for_status() + data = response.json() + return ProviderHealth( + status=data["status"], + provider=data["provider"], + version=data.get("version"), + model=data.get("model"), + device=data.get("device"), + ) + except Exception as e: + return ProviderHealth( + status="unhealthy", + provider="local", + version=None, + model=None, + 
device=None, + ) + + async def status(self) -> ProviderStatus: + """Get provider model status.""" + try: + response = await self.client.get(f"{self.base_url}/tts/status") + response.raise_for_status() + data = response.json() + return ProviderStatus( + model_loaded=data["model_loaded"], + model_size=data.get("model_size"), + available_sizes=data.get("available_sizes", []), + gpu_available=data.get("gpu_available"), + vram_used_mb=data.get("vram_used_mb"), + ) + except Exception as e: + return ProviderStatus( + model_loaded=False, + model_size=None, + available_sizes=[], + gpu_available=None, + vram_used_mb=None, + ) + + async def stop(self) -> None: + """Stop the provider (close HTTP client).""" + await self.client.aclose() diff --git a/backend/providers/types.py b/backend/providers/types.py new file mode 100644 index 00000000..8229cb74 --- /dev/null +++ b/backend/providers/types.py @@ -0,0 +1,34 @@ +""" +Shared types for TTS providers. +""" + +from typing import Optional, TypedDict +from enum import Enum + + +class ProviderType(str, Enum): + """Available provider types.""" + BUNDLED_MLX = "bundled-mlx" + BUNDLED_PYTORCH = "bundled-pytorch" + PYTORCH_CPU = "pytorch-cpu" + PYTORCH_CUDA = "pytorch-cuda" + REMOTE = "remote" + OPENAI = "openai" + + +class ProviderHealth(TypedDict): + """Provider health status.""" + status: str # "healthy", "unhealthy", "starting" + provider: str + version: Optional[str] + model: Optional[str] + device: Optional[str] + + +class ProviderStatus(TypedDict): + """Provider model status.""" + model_loaded: bool + model_size: Optional[str] + available_sizes: list[str] + gpu_available: Optional[bool] + vram_used_mb: Optional[int] diff --git a/backend/tts.py b/backend/tts.py index 98db3412..0f9cfd5f 100644 --- a/backend/tts.py +++ b/backend/tts.py @@ -1,5 +1,5 @@ """ -TTS inference module - delegates to backend abstraction layer. +TTS inference module - delegates to provider abstraction layer. 
""" from typing import Optional @@ -7,31 +7,51 @@ import io import soundfile as sf -from .backends import get_tts_backend, TTSBackend +from .backends import TTSBackend +from .providers import get_provider_manager +from .providers.base import TTSProvider -def get_tts_model() -> TTSBackend: +def get_tts_model() -> TTSProvider: """ - Get TTS backend instance (MLX or PyTorch based on platform). + Get TTS provider instance (via ProviderManager). Returns: - TTS backend instance + TTS provider instance """ - return get_tts_backend() + manager = get_provider_manager() + # Note: This is async but we need sync interface for backward compatibility + # In practice, this will be called from async contexts + import asyncio + try: + loop = asyncio.get_event_loop() + if loop.is_running(): + # We're in an async context, but can't await here + # Return a wrapper that will use the provider manager + return manager._get_default_provider() + else: + return loop.run_until_complete(manager.get_active_provider()) + except RuntimeError: + # No event loop, return default + return manager._get_default_provider() + + +async def get_tts_model_async() -> TTSProvider: + """ + Get TTS provider instance asynchronously. 
+ + Returns: + TTS provider instance + """ + manager = get_provider_manager() + return await manager.get_active_provider() def unload_tts_model(): """Unload TTS model to free memory.""" - backend = get_tts_backend() - backend.unload_model() - - -def audio_to_wav_bytes(audio: np.ndarray, sample_rate: int) -> bytes: - """Convert audio array to WAV bytes.""" - buffer = io.BytesIO() - sf.write(buffer, audio, sample_rate, format="WAV") - buffer.seek(0) - return buffer.read() + manager = get_provider_manager() + provider = manager._get_default_provider() + provider.unload_model() def audio_to_wav_bytes(audio: np.ndarray, sample_rate: int) -> bytes: diff --git a/docs/plans/TTS_PROVIDER_ARCHITECTURE.md b/docs/plans/TTS_PROVIDER_ARCHITECTURE.md index 8d35a7e5..14d87317 100644 --- a/docs/plans/TTS_PROVIDER_ARCHITECTURE.md +++ b/docs/plans/TTS_PROVIDER_ARCHITECTURE.md @@ -10,14 +10,17 @@ Split the monolithic backend into modular components: -1. **Main App** (~150-200MB): Tauri + FastAPI backend + Whisper + UI/profiles/history -2. **TTS Providers** (downloadable plugins): Separate executables for model inference +1. **Main App**: + - Windows/Linux (~150MB): Tauri + FastAPI backend + Whisper + UI/profiles/history + - macOS (~300MB): Same + MLX bundled for simplicity +2. 
**TTS Providers** (Windows/Linux only): Downloadable executables for PyTorch CPU/CUDA inference This architecture solves: - ✅ GitHub 2GB release artifact limit -- ✅ Frequent app updates without re-downloading large python binaries -- ✅ User choice of compute backend (CPU/GPU/Cloud) +- ✅ Frequent app updates without re-downloading large python binaries (Windows/Linux) +- ✅ User choice of compute backend (CPU/GPU/Cloud) on Windows/Linux +- ✅ Simplified out-of-the-box experience on macOS - ✅ External provider support (OpenAI, custom servers) - ✅ Future extensibility @@ -25,6 +28,7 @@ This architecture solves: ## Architecture Diagram +### Windows / Linux ``` ┌─────────────────────────────────────────────────────────┐ │ Voicebox App (Tauri + Backend) ~150MB │ @@ -39,27 +43,43 @@ This architecture solves: │ HTTP/IPC │ │ - ┌────────────────────────────────┼─────────────────┐ - │ │ │ - ▼ ▼ ▼ -┌─────────────────┐ ┌─────────────────┐ ┌──────────────────┐ -│ TTS Provider: │ │ TTS Provider: │ │ TTS Provider: │ -│ PyTorch CPU │ │ PyTorch CUDA │ │ MLX (Apple) │ -│ │ │ │ │ │ -│ ~300MB │ │ ~2.4GB │ │ ~800MB │ -│ │ │ │ │ │ -│ Local inference │ │ GPU inference │ │ Metal inference │ -└─────────────────┘ └─────────────────┘ └──────────────────┘ - │ │ │ - └────────────────────────┴─────────────────────┘ - │ - ┌─────────────▼──────────────┐ - │ Future Providers: │ - │ • Remote Server │ - │ • OpenAI API │ - │ • ElevenLabs │ - │ • Custom Docker Container │ - └────────────────────────────┘ + ┌─────────────────────┴─────────────────┐ + │ │ + ▼ ▼ + ┌─────────────────────┐ ┌─────────────────────┐ + │ TTS Provider: │ │ TTS Provider: │ + │ PyTorch CPU │ │ PyTorch CUDA │ + │ │ │ │ + │ ~300MB │ │ ~2.4GB │ + │ │ │ │ + │ Local inference │ │ GPU inference │ + └─────────────────────┘ └─────────────────────┘ + │ │ + └───────────────┬───────────────────────┘ + │ + ┌─────────────▼──────────────┐ + │ Future Providers: │ + │ • Remote Server │ + │ • OpenAI API │ + │ • ElevenLabs │ + │ • Custom Docker 
Container │ + └────────────────────────────┘ +``` + +### macOS +``` +┌─────────────────────────────────────────────────────────┐ +│ Voicebox App (Tauri + Backend) ~300MB │ +│ ├─ UI Layer (React) │ +│ ├─ Backend (FastAPI) │ +│ │ ├─ Voice Profiles │ +│ │ ├─ Generation History │ +│ │ ├─ Audio Editing / Stories │ +│ │ └─ MLX Backend (bundled) │ +│ └─ Whisper (bundled, tiny ~50MB) │ +│ │ +│ No provider downloads needed - works out of the box │ +└─────────────────────────────────────────────────────────┘ ``` --- @@ -91,18 +111,20 @@ This architecture solves: #### 1. Main App (voicebox.exe / .app / .AppImage) -**Size:** ~100-150MB +**Windows/Linux Size:** ~100-150MB +**macOS Size:** ~300-350MB (includes MLX) **Includes:** - Tauri runtime + React UI -- FastAPI backend (pure Python, no PyTorch) +- FastAPI backend (pure Python, no PyTorch on Windows/Linux) - Whisper model (tiny, ~50MB) - SQLite database - Profile/history/audio editing logic -- Provider management system +- Provider management system (Windows/Linux only) +- **MLX backend (macOS only, bundled)** -**Does NOT include:** +**Does NOT include (Windows/Linux only):** - PyTorch (CPU or CUDA) - TTS models (Qwen3-TTS) @@ -147,23 +169,7 @@ This architecture solves: --- -#### 4. TTS Provider: MLX - -**Binary:** `tts-provider-mlx` -**Size:** ~150MB - -**Includes:** - -- MLX framework -- MLX-optimized Qwen3-TTS -- Metal acceleration - -**Platform:** macOS only (Apple Silicon) -**Download source:** Cloudflare R2 - ---- - -#### 5. TTS Provider: Remote +#### 4. TTS Provider: Remote **Binary:** None (built-in config) **Size:** 0MB @@ -182,7 +188,7 @@ This architecture solves: --- -#### 6. TTS Provider: OpenAI +#### 5. TTS Provider: OpenAI **Binary:** None (API wrapper) **Size:** 0MB @@ -296,7 +302,10 @@ Model status. ```python class ProviderManager: - """Manages TTS provider lifecycle.""" + """Manages TTS provider lifecycle (Windows/Linux only). + + Note: macOS uses bundled MLX backend directly, no provider management needed. 
+ """ def __init__(self): self.active_provider: Optional[Provider] = None @@ -308,8 +317,6 @@ class ProviderManager: return await self._start_local_provider("tts-provider-pytorch-cpu.exe") elif provider_type == "pytorch-cuda": return await self._start_local_provider("tts-provider-pytorch-cuda.exe") - elif provider_type == "mlx": - return await self._start_local_provider("tts-provider-mlx") elif provider_type == "remote": return self.config["remote_url"] elif provider_type == "openai": @@ -434,15 +441,14 @@ class OpenAIProvider(TTSProvider): ```python class ProviderInstaller: - """Handles provider download and installation.""" + """Handles provider download and installation (Windows/Linux only).""" async def download_provider(self, provider_type: str): """Download provider binary from R2.""" binary_name = { "pytorch-cpu": "tts-provider-pytorch-cpu.exe", - "pytorch-cuda": "tts-provider-pytorch-cuda.exe", - "mlx": "tts-provider-mlx" + "pytorch-cuda": "tts-provider-pytorch-cuda.exe" }[provider_type] download_url = f"https://downloads.voicebox.sh/providers/v{PROVIDER_VERSION}/{binary_name}" @@ -525,44 +531,38 @@ export function ProviderSettings() { )}
- {/* PyTorch CPU */} -
-
- - -
- {!installedProviders?.includes("pytorch-cpu") && ( - - )} -
- - {/* MLX (macOS only) */} - {isMacOS && ( + {/* PyTorch CPU (Windows/Linux only) */} + {!isMacOS && (
- -
- {!installedProviders?.includes("mlx") && ( - )}
)} + {/* MLX bundled (macOS only) */} + {isMacOS && ( +
+
+
MLX (Apple Silicon)
+
+ Bundled with the app - optimized for M1/M2/M3 chips +
+
+
+ )} + {/* Remote */}
@@ -608,14 +608,18 @@ export function ProviderSettings() { ``` voicebox/ ├── backend/ -│ ├── main.py # Main FastAPI app (no TTS code) +│ ├── main.py # Main FastAPI app (no TTS on Win/Linux) +│ ├── backends/ +│ │ ├── __init__.py # Backend abstraction (existing) +│ │ ├── pytorch_backend.py # PyTorch backend (existing, for reference) +│ │ └── mlx_backend.py # MLX backend (bundled in macOS build only) │ ├── providers/ -│ │ ├── __init__.py # ProviderManager -│ │ ├── base.py # TTSProvider ABC +│ │ ├── __init__.py # ProviderManager (Windows/Linux) +│ │ ├── base.py # TTSProvider Protocol │ │ ├── local.py # LocalProvider (subprocess) │ │ ├── remote.py # RemoteProvider (HTTP) │ │ ├── openai.py # OpenAIProvider (API wrapper) -│ │ └── installer.py # Provider download logic +│ │ └── installer.py # Provider download logic (Windows/Linux) │ ├── profiles.py # Voice profile management │ ├── history.py # Generation history │ ├── transcribe.py # Whisper (still bundled) @@ -628,27 +632,22 @@ voicebox/ │ │ ├── requirements.txt # torch (CPU), qwen-tts, transformers │ │ └── build.spec # PyInstaller spec │ │ -│ ├── pytorch-cuda/ -│ │ ├── main.py # FastAPI server for TTS -│ │ ├── tts_backend.py # PyTorch TTS logic -│ │ ├── requirements.txt # torch+cu121, qwen-tts, transformers -│ │ └── build.spec # PyInstaller spec -│ │ -│ └── mlx/ +│ └── pytorch-cuda/ │ ├── main.py # FastAPI server for TTS -│ ├── mlx_backend.py # MLX TTS logic -│ ├── requirements.txt # mlx, qwen-tts-mlx +│ ├── tts_backend.py # PyTorch TTS logic +│ ├── requirements.txt # torch+cu121, qwen-tts, transformers │ └── build.spec # PyInstaller spec │ ├── app/ # Frontend (Tauri + React) │ └── src/ │ └── components/ │ └── ServerSettings/ -│ └── ProviderSettings.tsx +│ └── ProviderSettings.tsx # Only shown on Windows/Linux │ └── tauri/ └── src-tauri/ - └── tauri.conf.json # No externalBin for providers + └── tauri.conf.json # No externalBin for providers (Windows/Linux) + # MLX bundled in macOS build ``` --- @@ -671,33 +670,35 @@ 
voicebox/ ### Phase 2: Build Provider Binaries -**Goal:** Create standalone TTS provider executables +**Goal:** Create standalone TTS provider executables (Windows/Linux only) 1. Create separate PyInstaller specs for each provider 2. Build provider executables: - `tts-provider-pytorch-cpu.exe` (~300MB) - `tts-provider-pytorch-cuda.exe` (~2.4GB) - - `tts-provider-mlx` (~800MB, macOS) 3. Test subprocess communication 4. Upload providers to Cloudflare R2 **Result:** Provider binaries exist but aren't used yet +**Note:** macOS keeps MLX bundled in main app - no separate provider needed + --- ### Phase 3: Remove PyTorch from Main App -**Goal:** Split main app from providers +**Goal:** Split main app from providers (Windows/Linux only) -1. Exclude PyTorch/Qwen3-TTS from main app PyInstaller spec -2. Main app now requires provider download +1. Exclude PyTorch/Qwen3-TTS from Windows/Linux main app PyInstaller spec +2. Windows/Linux app now requires provider download 3. Update GitHub CI to build multiple artifacts: - - `voicebox-{version}-{platform}.exe` (~150MB) + - `voicebox-{version}-windows.exe` (~150MB, no TTS) + - `voicebox-{version}-linux.AppImage` (~150MB, no TTS) + - `voicebox-{version}-macos.app` (~300MB, MLX bundled) - `tts-provider-pytorch-cpu-{version}.exe` - `tts-provider-pytorch-cuda-{version}.exe` - - `tts-provider-mlx-{version}` (macOS) -**Result:** Main app is small, providers downloaded separately +**Result:** Windows/Linux apps are small with downloadable providers, macOS app is self-contained --- @@ -767,7 +768,7 @@ async def check_provider_compatibility(provider_version: str) -> bool: ## User Flows -### First-Time Setup +### First-Time Setup (Windows/Linux) 1. User downloads and installs Voicebox (~150MB) 2. 
App launches → detects no TTS provider installed @@ -784,10 +785,6 @@ async def check_provider_compatibility(provider_version: str) -> bool: ✓ Works on any system ✗ Slower inference - [ ] MLX (800MB) [Download] - ✓ Fast on Apple Silicon - ✗ macOS only (M1/M2/M3) - [ ] Remote Server URL: ___________________ @@ -799,19 +796,31 @@ async def check_provider_compatibility(provider_version: str) -> bool: 5. Provider installs to AppData/Application Support 6. App starts provider → ready to use +### First-Time Setup (macOS) + +1. User downloads and installs Voicebox (~300MB with MLX bundled) +2. App launches → MLX backend is ready immediately +3. No provider setup needed - works out of the box + --- ### App Update Flow (No Provider Change) **Scenario:** Bug fix in UI, no backend changes +**Windows/Linux:** 1. User gets update notification: "Voicebox v0.2.1 available" 2. Downloads update (~150MB, not 2.4GB!) 3. Installs and restarts 4. **Provider stays the same** (no re-download needed) 5. App starts using existing provider -**User experience:** Fast updates, no multi-GB downloads +**macOS:** +1. User gets update notification: "Voicebox v0.2.1 available" +2. Downloads update (~300MB with MLX bundled) +3. 
Installs and restarts - ready to use + +**User experience:** Fast updates, no multi-GB downloads (especially for CUDA users) --- @@ -846,9 +855,10 @@ async def check_provider_compatibility(provider_version: str) -> bool: | Benefit | Details | | ----------------------------- | --------------------------------------------------------- | -| **GitHub Releases Work** | Main app ~150MB << 2GB limit | +| **GitHub Releases Work** | Main app ~150MB (Win/Linux), ~300MB (macOS) << 2GB limit | | **Fast Updates** | UI/feature updates don't require re-downloading providers | -| **User Choice** | CPU, CUDA, MLX, OpenAI, remote server | +| **User Choice** | CPU, CUDA, OpenAI, remote server (Win/Linux) | +| **macOS Simplicity** | MLX bundled - works immediately, no provider setup needed | | **External Provider Support** | Users can run their own TTS servers | | **Bandwidth Savings** | Only download provider once, app updates are small | | **Future-Proof** | Easy to add new providers (ElevenLabs, custom models) | diff --git a/providers/README.md b/providers/README.md new file mode 100644 index 00000000..9ca3e03b --- /dev/null +++ b/providers/README.md @@ -0,0 +1,291 @@ +# TTS Provider Architecture + +This document explains how Voicebox's modular TTS provider system works. + +## Overview + +Voicebox uses a **pluggable provider architecture** that separates the main application from TTS inference. 
This solves several problems: + +- **GitHub's 2GB release limit** - CUDA builds are ~2.4GB, too large for GitHub releases +- **Faster app updates** - UI/feature updates don't require re-downloading heavy ML binaries +- **User choice** - Users can pick CPU, CUDA, or external providers based on their hardware + +## Architecture Diagram + +``` +┌─────────────────────────────────────────────────────────────┐ +│ Voicebox App │ +│ ├─ UI (React) │ +│ ├─ Backend (FastAPI) │ +│ │ ├─ Voice Profiles │ +│ │ ├─ Generation History │ +│ │ ├─ Whisper STT (bundled) │ +│ │ └─ Provider Manager ◄────────────────┐ │ +│ │ │ │ +│ └─ providers/ │ │ +│ ├─ bundled.py (wraps backends/) │ │ +│ └─ local.py (HTTP client)─────────────┼───┐ │ +│ │ │ │ +└────────────────────────────────────────────┼───┼────────────┘ + │ │ + ┌────────────────────┘ │ + │ │ HTTP + ▼ ▼ + ┌──────────────────┐ ┌──────────────────────┐ + │ backends/ │ │ Standalone Provider │ + │ (bundled on Mac) │ │ (subprocess) │ + │ │ │ │ + │ - mlx_backend │ │ - FastAPI server │ + │ - pytorch_backend│ │ - PyTorch + Qwen-TTS │ + └──────────────────┘ │ - Runs on localhost │ + └──────────────────────┘ +``` + +## Platform Behavior + +| Platform | App Size | TTS Backend | Provider Download | +|----------|----------|-------------|-------------------| +| macOS (Apple Silicon) | ~300MB | MLX bundled | Not needed | +| macOS (Intel) | ~300MB | PyTorch bundled | Not needed | +| Windows | ~150MB | None bundled | Required | +| Linux | ~150MB | None bundled | Required | + +### macOS (Apple Silicon) +- MLX backend is **bundled** in the app +- Works immediately after install +- Uses Metal for GPU acceleration + +### macOS (Intel) +- PyTorch backend is **bundled** in the app +- Works immediately after install +- Uses CPU inference + +### Windows / Linux +- **No TTS bundled** - keeps app small (~150MB) +- On first use, prompts to download a provider +- Provider options: + - **PyTorch CPU** (~300MB) - Works on any system + - **PyTorch CUDA** (~2.4GB) - 
Fast inference on NVIDIA GPUs
+
+## Directory Structure
+
+```
+voicebox/
+├── backend/
+│   ├── backends/              # Actual TTS implementations
+│   │   ├── __init__.py        # TTSBackend Protocol
+│   │   ├── mlx_backend.py     # MLX implementation (macOS)
+│   │   └── pytorch_backend.py # PyTorch implementation
+│   │
+│   └── providers/             # Provider abstraction layer
+│       ├── __init__.py        # ProviderManager
+│       ├── base.py            # TTSProvider Protocol
+│       ├── bundled.py         # Wraps backends/ for bundled use
+│       ├── local.py           # HTTP client for subprocess providers
+│       ├── installer.py       # Downloads providers from R2
+│       └── types.py           # Shared types
+│
+└── providers/                 # Standalone provider builds
+    ├── pytorch-cpu/
+    │   ├── main.py            # FastAPI server
+    │   ├── build.py           # PyInstaller build script
+    │   └── requirements.txt
+    │
+    └── pytorch-cuda/
+        ├── main.py            # FastAPI server
+        ├── build.py           # PyInstaller build script
+        └── requirements.txt
+```
+
+## How Providers Work
+
+### 1. BundledProvider (macOS)
+
+On macOS, the `BundledProvider` directly calls the bundled `backends/` code:
+
+```python
+# backend/providers/bundled.py
+class BundledProvider:
+    def __init__(self):
+        self._backend = get_tts_backend()  # MLX or PyTorch
+
+    async def generate(self, text, voice_prompt, ...):
+        return await self._backend.generate(text, voice_prompt, ...)
+```
+
+### 2. LocalProvider (Windows/Linux)
+
+On Windows/Linux, the `LocalProvider` communicates with a standalone provider via HTTP:
+
+```python
+# backend/providers/local.py
+class LocalProvider:
+    def __init__(self, base_url: str):
+        self.base_url = base_url  # e.g., "http://127.0.0.1:8765"
+
+    async def generate(self, text, voice_prompt, ...):
+        response = await self.client.post(
+            f"{self.base_url}/tts/generate",
+            json={"text": text, "voice_prompt": voice_prompt, ...}
+        )
+        # Decode audio from response
+        return audio, sample_rate
+```
+
+### 3. 
Standalone Provider Server + +The standalone providers are self-contained FastAPI servers: + +```python +# providers/pytorch-cpu/main.py +@app.post("/tts/generate") +async def generate(text: str, voice_prompt: dict, ...): + audio, sr = await backend.generate(text, voice_prompt, ...) + return {"audio": base64_encode(audio), "sample_rate": sr} +``` + +## Provider API Specification + +All providers (local or remote) must implement these HTTP endpoints: + +### POST /tts/generate +Generate speech from text. + +**Request:** +```json +{ + "text": "Hello world!", + "voice_prompt": { /* voice embedding */ }, + "language": "en", + "seed": 12345, + "model_size": "1.7B" +} +``` + +**Response:** +```json +{ + "audio": "base64-encoded-wav", + "sample_rate": 24000, + "duration": 2.5 +} +``` + +### POST /tts/create_voice_prompt +Create voice embedding from reference audio. + +**Request:** `multipart/form-data` +- `audio`: Audio file +- `reference_text`: Transcript + +**Response:** +```json +{ + "voice_prompt": { /* voice embedding */ }, + "was_cached": false +} +``` + +### GET /tts/health +Health check. + +**Response:** +```json +{ + "status": "healthy", + "provider": "pytorch-cuda", + "version": "1.0.0", + "model": "1.7B", + "device": "cuda:0" +} +``` + +### GET /tts/status +Model status. + +**Response:** +```json +{ + "model_loaded": true, + "model_size": "1.7B", + "available_sizes": ["0.6B", "1.7B"], + "gpu_available": true, + "vram_used_mb": 1234 +} +``` + +## Provider Lifecycle + +### Startup Flow (Windows/Linux) + +``` +1. App launches +2. ProviderManager checks for installed providers +3. If none installed: + └─ Show setup wizard, prompt download +4. If installed: + ├─ Start provider subprocess on random port + ├─ Wait for /tts/health to return 200 + └─ Create LocalProvider with that URL +5. Generation requests go through LocalProvider → subprocess +``` + +### Download Flow + +``` +1. User clicks "Download PyTorch CUDA" +2. 
Installer downloads from Cloudflare R2: + https://downloads.voicebox.sh/providers/v1.0.0/tts-provider-pytorch-cuda-windows.exe +3. Saved to: + - Windows: %APPDATA%/voicebox/providers/ + - Linux: ~/.local/share/voicebox/providers/ +4. Provider is now available to start +``` + +## Building Providers + +### Prerequisites +- Python 3.12 +- PyInstaller + +### Build PyTorch CPU Provider +```bash +cd providers/pytorch-cpu +pip install -r requirements.txt +python build.py +# Output: dist/tts-provider-pytorch-cpu.exe +``` + +### Build PyTorch CUDA Provider +```bash +cd providers/pytorch-cuda +pip install torch --index-url https://download.pytorch.org/whl/cu121 +pip install -r requirements.txt +python build.py +# Output: dist/tts-provider-pytorch-cuda.exe (~2.4GB) +``` + +## Provider Versioning + +Providers have **independent versions** from the app: + +- **App version:** `v0.2.0` (frequent updates) +- **Provider version:** `v1.0.0` (rare updates) + +Providers only need updates when: +- TTS model changes (new Qwen3-TTS version) +- API spec changes +- Bug fixes in inference code + +The app checks provider compatibility on startup. + +## Future Providers + +The architecture supports additional providers: + +- **Remote Server** - Connect to your own TTS server +- **OpenAI API** - Use OpenAI's TTS (requires API key) +- **ElevenLabs** - Cloud TTS service +- **Docker** - Run providers in containers + +These would implement the same HTTP API spec. diff --git a/providers/pytorch-cpu/build.py b/providers/pytorch-cpu/build.py new file mode 100644 index 00000000..2276b686 --- /dev/null +++ b/providers/pytorch-cpu/build.py @@ -0,0 +1,82 @@ +""" +PyInstaller build script for PyTorch CPU provider. 
+""" + +import PyInstaller.__main__ +import os +import platform +from pathlib import Path + + +def build_provider(): + """Build PyTorch CPU provider as standalone binary.""" + provider_dir = Path(__file__).parent + backend_dir = provider_dir.parent.parent / "backend" + + # PyInstaller arguments + args = [ + 'main.py', + '--onefile', + '--name', 'tts-provider-pytorch-cpu', + ] + + # Add backend to path + args.extend([ + '--paths', str(backend_dir.parent), + ]) + + # Add hidden imports + args.extend([ + '--hidden-import', 'backend', + '--hidden-import', 'backend.backends', + '--hidden-import', 'backend.backends.pytorch_backend', + '--hidden-import', 'backend.config', + '--hidden-import', 'backend.utils.audio', + '--hidden-import', 'backend.utils.cache', + '--hidden-import', 'backend.utils.progress', + '--hidden-import', 'backend.utils.hf_progress', + '--hidden-import', 'backend.utils.tasks', + '--hidden-import', 'torch', + '--hidden-import', 'transformers', + '--hidden-import', 'qwen_tts', + '--hidden-import', 'qwen_tts.inference', + '--hidden-import', 'qwen_tts.inference.qwen3_tts_model', + '--hidden-import', 'qwen_tts.inference.qwen3_tts_tokenizer', + '--hidden-import', 'qwen_tts.core', + '--hidden-import', 'qwen_tts.cli', + '--copy-metadata', 'qwen-tts', + '--collect-submodules', 'qwen_tts', + '--collect-data', 'qwen_tts', + '--hidden-import', 'pkg_resources.extern', + '--collect-submodules', 'jaraco', + '--hidden-import', 'fastapi', + '--hidden-import', 'uvicorn', + '--hidden-import', 'soundfile', + '--hidden-import', 'numpy', + '--hidden-import', 'librosa', + ]) + + # Platform-specific extensions + if platform.system() == "Windows": + args[2] = 'tts-provider-pytorch-cpu.exe' + + args.extend([ + '--noconfirm', + '--clean', + ]) + + # Change to provider directory + os.chdir(provider_dir) + + # Run PyInstaller + PyInstaller.__main__.run(args) + + binary_name = 'tts-provider-pytorch-cpu' + if platform.system() == "Windows": + binary_name += '.exe' + + print(f"Binary 
built in {provider_dir / 'dist' / binary_name}") + + +if __name__ == '__main__': + build_provider() diff --git a/providers/pytorch-cpu/main.py b/providers/pytorch-cpu/main.py new file mode 100644 index 00000000..e295e08b --- /dev/null +++ b/providers/pytorch-cpu/main.py @@ -0,0 +1,238 @@ +""" +Standalone TTS provider server for PyTorch CPU. +""" + +import argparse +import asyncio +import base64 +import io +import sys +from pathlib import Path +from typing import Optional + +import numpy as np +import soundfile as sf +from fastapi import FastAPI, File, Form, HTTPException, UploadFile +from fastapi.middleware.cors import CORSMiddleware +import uvicorn + +# Add parent directory to path to import backend modules +sys.path.insert(0, str(Path(__file__).parent.parent.parent / "backend")) + +from backend.backends.pytorch_backend import PyTorchTTSBackend + + +app = FastAPI(title="Voicebox TTS Provider - PyTorch CPU") + +# CORS middleware +app.add_middleware( + CORSMiddleware, + allow_origins=["*"], + allow_credentials=True, + allow_methods=["*"], + allow_headers=["*"], +) + +# Global backend instance +_backend: Optional[PyTorchTTSBackend] = None + + +def get_backend() -> PyTorchTTSBackend: + """Get or create backend instance.""" + global _backend + if _backend is None: + _backend = PyTorchTTSBackend() + return _backend + + +@app.get("/tts/health") +async def health(): + """Health check endpoint.""" + backend = get_backend() + backend_type = "pytorch-cpu" + + model_size = None + if backend.is_loaded(): + if hasattr(backend, '_current_model_size') and backend._current_model_size: + model_size = backend._current_model_size + + device = backend.device if hasattr(backend, 'device') else "cpu" + + return { + "status": "healthy", + "provider": backend_type, + "version": "1.0.0", # TODO: Get from version file + "model": model_size, + "device": device, + } + + +@app.get("/tts/status") +async def status(): + """Model status endpoint.""" + backend = get_backend() + + model_size = None 
+ if backend.is_loaded(): + if hasattr(backend, '_current_model_size') and backend._current_model_size: + model_size = backend._current_model_size + + available_sizes = ["1.7B", "0.6B"] + + gpu_available = False + vram_used_mb = None + + try: + import torch + gpu_available = torch.cuda.is_available() + if gpu_available: + vram_used_mb = int(torch.cuda.memory_allocated() / 1024 / 1024) + except ImportError: + pass + + return { + "model_loaded": backend.is_loaded(), + "model_size": model_size, + "available_sizes": available_sizes, + "gpu_available": gpu_available, + "vram_used_mb": vram_used_mb, + } + + +@app.post("/tts/generate") +async def generate( + text: str, + voice_prompt: dict, + language: str = "en", + seed: Optional[int] = None, + model_size: str = "1.7B", +): + """ + Generate speech from text. + + Request body (JSON): + { + "text": "Hello world!", + "voice_prompt": {...}, + "language": "en", + "seed": 12345, + "model_size": "1.7B" + } + """ + backend = get_backend() + + # Load model if not loaded or different size + if not backend.is_loaded() or ( + hasattr(backend, '_current_model_size') and + backend._current_model_size != model_size + ): + await backend.load_model_async(model_size) + + # Generate audio + audio, sample_rate = await backend.generate( + text=text, + voice_prompt=voice_prompt, + language=language, + seed=seed, + instruct=None, # TODO: Add instruct support + ) + + # Convert to base64 + buffer = io.BytesIO() + sf.write(buffer, audio, sample_rate, format="WAV") + buffer.seek(0) + audio_bytes = buffer.read() + audio_b64 = base64.b64encode(audio_bytes).decode('utf-8') + + # Calculate duration + duration = len(audio) / sample_rate + + return { + "audio": audio_b64, + "sample_rate": sample_rate, + "duration": duration, + } + + +@app.post("/tts/create_voice_prompt") +async def create_voice_prompt( + audio: UploadFile = File(...), + reference_text: str = Form(...), + use_cache: bool = Form(True), +): + """ + Create voice prompt from reference audio. 
+ + Request (multipart/form-data): + - audio: Audio file + - reference_text: Transcript + - use_cache: Whether to use cached prompts (default: true) + """ + backend = get_backend() + + # Save uploaded file temporarily + import tempfile + with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file: + tmp_path = tmp_file.name + content = await audio.read() + tmp_file.write(content) + + try: + # Create voice prompt + voice_prompt, was_cached = await backend.create_voice_prompt( + audio_path=tmp_path, + reference_text=reference_text, + use_cache=use_cache, + ) + + return { + "voice_prompt": voice_prompt, + "was_cached": was_cached, + } + finally: + # Clean up temp file + Path(tmp_path).unlink(missing_ok=True) + + +def main(): + """Main entry point.""" + parser = argparse.ArgumentParser(description="Voicebox TTS Provider - PyTorch CPU") + parser.add_argument( + "--port", + type=int, + default=0, # 0 means random port + help="Port to bind to", + ) + parser.add_argument( + "--data-dir", + type=str, + default=None, + help="Data directory for models and cache", + ) + args = parser.parse_args() + + # Set data directory if provided + if args.data_dir: + from backend import config + config.set_data_dir(args.data_dir) + + # Determine port + port = args.port + if port == 0: + import socket + with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: + s.bind(('', 0)) + port = s.getsockname()[1] + + print(f"Starting TTS Provider (PyTorch CPU) on port {port}") + + uvicorn.run( + app, + host="127.0.0.1", + port=port, + log_level="info", + ) + + +if __name__ == "__main__": + main() diff --git a/providers/pytorch-cpu/requirements.txt b/providers/pytorch-cpu/requirements.txt new file mode 100644 index 00000000..35593b57 --- /dev/null +++ b/providers/pytorch-cpu/requirements.txt @@ -0,0 +1,8 @@ +torch>=2.0.0 +transformers>=4.30.0 +qwen-tts>=0.1.0 +fastapi>=0.100.0 +uvicorn>=0.23.0 +soundfile>=0.12.0 +numpy>=1.24.0 +librosa>=0.10.0 diff --git 
a/providers/pytorch-cuda/build.py b/providers/pytorch-cuda/build.py new file mode 100644 index 00000000..c6e56e7b --- /dev/null +++ b/providers/pytorch-cuda/build.py @@ -0,0 +1,84 @@ +""" +PyInstaller build script for PyTorch CUDA provider. +""" + +import PyInstaller.__main__ +import os +import platform +from pathlib import Path + + +def build_provider(): + """Build PyTorch CUDA provider as standalone binary.""" + provider_dir = Path(__file__).parent + backend_dir = provider_dir.parent.parent / "backend" + + # PyInstaller arguments + args = [ + 'main.py', + '--onefile', + '--name', 'tts-provider-pytorch-cuda', + ] + + # Add backend to path + args.extend([ + '--paths', str(backend_dir.parent), + ]) + + # Add hidden imports + args.extend([ + '--hidden-import', 'backend', + '--hidden-import', 'backend.backends', + '--hidden-import', 'backend.backends.pytorch_backend', + '--hidden-import', 'backend.config', + '--hidden-import', 'backend.utils.audio', + '--hidden-import', 'backend.utils.cache', + '--hidden-import', 'backend.utils.progress', + '--hidden-import', 'backend.utils.hf_progress', + '--hidden-import', 'backend.utils.tasks', + '--hidden-import', 'torch', + '--hidden-import', 'torch.cuda', + '--hidden-import', 'torch.backends.cudnn', + '--hidden-import', 'transformers', + '--hidden-import', 'qwen_tts', + '--hidden-import', 'qwen_tts.inference', + '--hidden-import', 'qwen_tts.inference.qwen3_tts_model', + '--hidden-import', 'qwen_tts.inference.qwen3_tts_tokenizer', + '--hidden-import', 'qwen_tts.core', + '--hidden-import', 'qwen_tts.cli', + '--copy-metadata', 'qwen-tts', + '--collect-submodules', 'qwen_tts', + '--collect-data', 'qwen_tts', + '--hidden-import', 'pkg_resources.extern', + '--collect-submodules', 'jaraco', + '--hidden-import', 'fastapi', + '--hidden-import', 'uvicorn', + '--hidden-import', 'soundfile', + '--hidden-import', 'numpy', + '--hidden-import', 'librosa', + ]) + + # Platform-specific extensions + if platform.system() == "Windows": + args[2] = 
'tts-provider-pytorch-cuda.exe' + + args.extend([ + '--noconfirm', + '--clean', + ]) + + # Change to provider directory + os.chdir(provider_dir) + + # Run PyInstaller + PyInstaller.__main__.run(args) + + binary_name = 'tts-provider-pytorch-cuda' + if platform.system() == "Windows": + binary_name += '.exe' + + print(f"Binary built in {provider_dir / 'dist' / binary_name}") + + +if __name__ == '__main__': + build_provider() diff --git a/providers/pytorch-cuda/main.py b/providers/pytorch-cuda/main.py new file mode 100644 index 00000000..a89782ff --- /dev/null +++ b/providers/pytorch-cuda/main.py @@ -0,0 +1,238 @@ +""" +Standalone TTS provider server for PyTorch CUDA. +""" + +import argparse +import asyncio +import base64 +import io +import sys +from pathlib import Path +from typing import Optional + +import numpy as np +import soundfile as sf +from fastapi import FastAPI, File, Form, HTTPException, UploadFile +from fastapi.middleware.cors import CORSMiddleware +import uvicorn + +# Add parent directory to path to import backend modules +sys.path.insert(0, str(Path(__file__).parent.parent.parent / "backend")) + +from backend.backends.pytorch_backend import PyTorchTTSBackend + + +app = FastAPI(title="Voicebox TTS Provider - PyTorch CUDA") + +# CORS middleware +app.add_middleware( + CORSMiddleware, + allow_origins=["*"], + allow_credentials=True, + allow_methods=["*"], + allow_headers=["*"], +) + +# Global backend instance +_backend: Optional[PyTorchTTSBackend] = None + + +def get_backend() -> PyTorchTTSBackend: + """Get or create backend instance.""" + global _backend + if _backend is None: + _backend = PyTorchTTSBackend() + return _backend + + +@app.get("/tts/health") +async def health(): + """Health check endpoint.""" + backend = get_backend() + backend_type = "pytorch-cuda" + + model_size = None + if backend.is_loaded(): + if hasattr(backend, '_current_model_size') and backend._current_model_size: + model_size = backend._current_model_size + + device = backend.device 
if hasattr(backend, 'device') else "cpu" + + return { + "status": "healthy", + "provider": backend_type, + "version": "1.0.0", # TODO: Get from version file + "model": model_size, + "device": device, + } + + +@app.get("/tts/status") +async def status(): + """Model status endpoint.""" + backend = get_backend() + + model_size = None + if backend.is_loaded(): + if hasattr(backend, '_current_model_size') and backend._current_model_size: + model_size = backend._current_model_size + + available_sizes = ["1.7B", "0.6B"] + + gpu_available = False + vram_used_mb = None + + try: + import torch + gpu_available = torch.cuda.is_available() + if gpu_available: + vram_used_mb = int(torch.cuda.memory_allocated() / 1024 / 1024) + except ImportError: + pass + + return { + "model_loaded": backend.is_loaded(), + "model_size": model_size, + "available_sizes": available_sizes, + "gpu_available": gpu_available, + "vram_used_mb": vram_used_mb, + } + + +@app.post("/tts/generate") +async def generate( + text: str, + voice_prompt: dict, + language: str = "en", + seed: Optional[int] = None, + model_size: str = "1.7B", +): + """ + Generate speech from text. 
+ + Request body (JSON): + { + "text": "Hello world!", + "voice_prompt": {...}, + "language": "en", + "seed": 12345, + "model_size": "1.7B" + } + """ + backend = get_backend() + + # Load model if not loaded or different size + if not backend.is_loaded() or ( + hasattr(backend, '_current_model_size') and + backend._current_model_size != model_size + ): + await backend.load_model_async(model_size) + + # Generate audio + audio, sample_rate = await backend.generate( + text=text, + voice_prompt=voice_prompt, + language=language, + seed=seed, + instruct=None, # TODO: Add instruct support + ) + + # Convert to base64 + buffer = io.BytesIO() + sf.write(buffer, audio, sample_rate, format="WAV") + buffer.seek(0) + audio_bytes = buffer.read() + audio_b64 = base64.b64encode(audio_bytes).decode('utf-8') + + # Calculate duration + duration = len(audio) / sample_rate + + return { + "audio": audio_b64, + "sample_rate": sample_rate, + "duration": duration, + } + + +@app.post("/tts/create_voice_prompt") +async def create_voice_prompt( + audio: UploadFile = File(...), + reference_text: str = Form(...), + use_cache: bool = Form(True), +): + """ + Create voice prompt from reference audio. 
+ + Request (multipart/form-data): + - audio: Audio file + - reference_text: Transcript + - use_cache: Whether to use cached prompts (default: true) + """ + backend = get_backend() + + # Save uploaded file temporarily + import tempfile + with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file: + tmp_path = tmp_file.name + content = await audio.read() + tmp_file.write(content) + + try: + # Create voice prompt + voice_prompt, was_cached = await backend.create_voice_prompt( + audio_path=tmp_path, + reference_text=reference_text, + use_cache=use_cache, + ) + + return { + "voice_prompt": voice_prompt, + "was_cached": was_cached, + } + finally: + # Clean up temp file + Path(tmp_path).unlink(missing_ok=True) + + +def main(): + """Main entry point.""" + parser = argparse.ArgumentParser(description="Voicebox TTS Provider - PyTorch CUDA") + parser.add_argument( + "--port", + type=int, + default=0, # 0 means random port + help="Port to bind to", + ) + parser.add_argument( + "--data-dir", + type=str, + default=None, + help="Data directory for models and cache", + ) + args = parser.parse_args() + + # Set data directory if provided + if args.data_dir: + from backend import config + config.set_data_dir(args.data_dir) + + # Determine port + port = args.port + if port == 0: + import socket + with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: + s.bind(('', 0)) + port = s.getsockname()[1] + + print(f"Starting TTS Provider (PyTorch CUDA) on port {port}") + + uvicorn.run( + app, + host="127.0.0.1", + port=port, + log_level="info", + ) + + +if __name__ == "__main__": + main() diff --git a/providers/pytorch-cuda/requirements.txt b/providers/pytorch-cuda/requirements.txt new file mode 100644 index 00000000..0f5eafd0 --- /dev/null +++ b/providers/pytorch-cuda/requirements.txt @@ -0,0 +1,10 @@ +torch>=2.0.0 --index-url https://download.pytorch.org/whl/cu121 +torchvision>=0.15.0 --index-url https://download.pytorch.org/whl/cu121 +torchaudio>=2.0.0 --index-url 
https://download.pytorch.org/whl/cu121 +transformers>=4.30.0 +qwen-tts>=0.1.0 +fastapi>=0.100.0 +uvicorn>=0.23.0 +soundfile>=0.12.0 +numpy>=1.24.0 +librosa>=0.10.0 From d89521559aa0bb9d3266ff3967eb0319ba9f74ca Mon Sep 17 00:00:00 2001 From: Jamie Pine Date: Sat, 31 Jan 2026 03:09:49 -0800 Subject: [PATCH 02/33] Update LocalProvider to manage model size dynamically and clean up build scripts - Introduced a new attribute `_current_model_size` in `LocalProvider` to store the current model size, allowing for dynamic configuration during generation. - Updated the `generate` method to use the current model size instead of a hardcoded value. - Modified the `load_model` method to track the requested model size. - Removed platform-specific extension handling from the build scripts for both CPU and CUDA providers to streamline the build process. --- backend/providers/local.py | 9 +++++---- providers/pytorch-cpu/build.py | 4 ---- providers/pytorch-cuda/build.py | 4 ---- 3 files changed, 5 insertions(+), 12 deletions(-) diff --git a/backend/providers/local.py b/backend/providers/local.py index 1bde7bfe..004e6fc3 100644 --- a/backend/providers/local.py +++ b/backend/providers/local.py @@ -25,6 +25,7 @@ def __init__(self, base_url: str): """ self.base_url = base_url.rstrip('/') self.client = httpx.AsyncClient(timeout=300.0) # 5 minute timeout for generation + self._current_model_size = "1.7B" # Default model size async def generate( self, @@ -42,7 +43,7 @@ async def generate( "voice_prompt": voice_prompt, "language": language, "seed": seed, - "model_size": "1.7B", # TODO: Make configurable + "model_size": self._current_model_size, } ) response.raise_for_status() @@ -116,9 +117,9 @@ async def combine_voice_prompts( async def load_model(self, model_size: str) -> None: """Load TTS model.""" - # Model loading is handled automatically by the provider server - # when generate() is called, so this is a no-op - pass + # Track the requested model size - the provider server will load it + 
# when generate() is called with this size + self._current_model_size = model_size def unload_model(self) -> None: """Unload model to free memory.""" diff --git a/providers/pytorch-cpu/build.py b/providers/pytorch-cpu/build.py index 2276b686..8894edcb 100644 --- a/providers/pytorch-cpu/build.py +++ b/providers/pytorch-cpu/build.py @@ -56,10 +56,6 @@ def build_provider(): '--hidden-import', 'librosa', ]) - # Platform-specific extensions - if platform.system() == "Windows": - args[2] = 'tts-provider-pytorch-cpu.exe' - args.extend([ '--noconfirm', '--clean', diff --git a/providers/pytorch-cuda/build.py b/providers/pytorch-cuda/build.py index c6e56e7b..8ff67334 100644 --- a/providers/pytorch-cuda/build.py +++ b/providers/pytorch-cuda/build.py @@ -58,10 +58,6 @@ def build_provider(): '--hidden-import', 'librosa', ]) - # Platform-specific extensions - if platform.system() == "Windows": - args[2] = 'tts-provider-pytorch-cuda.exe' - args.extend([ '--noconfirm', '--clean', From ec9402c568c3104cb55532721541c49bddd18ccd Mon Sep 17 00:00:00 2001 From: Jamie Pine Date: Sat, 31 Jan 2026 03:11:34 -0800 Subject: [PATCH 03/33] Update qwen-tts version in requirements for CPU and CUDA providers from 0.1.0 to 0.0.5 --- providers/pytorch-cpu/requirements.txt | 2 +- providers/pytorch-cuda/requirements.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/providers/pytorch-cpu/requirements.txt b/providers/pytorch-cpu/requirements.txt index 35593b57..3f4144e3 100644 --- a/providers/pytorch-cpu/requirements.txt +++ b/providers/pytorch-cpu/requirements.txt @@ -1,6 +1,6 @@ torch>=2.0.0 transformers>=4.30.0 -qwen-tts>=0.1.0 +qwen-tts>=0.0.5 fastapi>=0.100.0 uvicorn>=0.23.0 soundfile>=0.12.0 diff --git a/providers/pytorch-cuda/requirements.txt b/providers/pytorch-cuda/requirements.txt index 0f5eafd0..5a106503 100644 --- a/providers/pytorch-cuda/requirements.txt +++ b/providers/pytorch-cuda/requirements.txt @@ -2,7 +2,7 @@ torch>=2.0.0 --index-url 
https://download.pytorch.org/whl/cu121 torchvision>=0.15.0 --index-url https://download.pytorch.org/whl/cu121 torchaudio>=2.0.0 --index-url https://download.pytorch.org/whl/cu121 transformers>=4.30.0 -qwen-tts>=0.1.0 +qwen-tts>=0.0.5 fastapi>=0.100.0 uvicorn>=0.23.0 soundfile>=0.12.0 From ec0fb601978d1983abc8eecf4ba88f30c395996c Mon Sep 17 00:00:00 2001 From: Jamie Pine Date: Sat, 31 Jan 2026 03:40:27 -0800 Subject: [PATCH 04/33] Enhance release workflow and add radio group component - Updated the release workflow to include a new configuration for the Ubuntu 22.04 platform without TTS bundled. - Added the @radix-ui/react-radio-group dependency to package.json. - Implemented a new RadioGroup component for better UI handling of radio inputs. --- .github/workflows/release.yml | 8 ++--- app/package.json | 1 + app/src/components/ui/radio-group.tsx | 44 +++++++++++++++++++++++++++ 3 files changed, 49 insertions(+), 4 deletions(-) create mode 100644 app/src/components/ui/radio-group.tsx diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index de067112..07f5c98d 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -132,10 +132,10 @@ jobs: python-version: "3.12" backend: "pytorch" # Linux - No TTS bundled, providers downloaded separately - # - platform: 'ubuntu-22.04' - # args: '' - # python-version: '3.12' - # backend: 'none' + - platform: "ubuntu-22.04" + args: "" + python-version: "3.12" + backend: "none" # Windows - No TTS bundled, providers downloaded separately - platform: "windows-latest" args: "" diff --git a/app/package.json b/app/package.json index 905dea23..00170cca 100644 --- a/app/package.json +++ b/app/package.json @@ -24,6 +24,7 @@ "@radix-ui/react-label": "^2.1.0", "@radix-ui/react-popover": "^1.1.1", "@radix-ui/react-progress": "^1.1.0", + "@radix-ui/react-radio-group": "^1.2.0", "@radix-ui/react-scroll-area": "^1.1.0", "@radix-ui/react-select": "^2.1.1", "@radix-ui/react-separator": "^1.1.0", diff --git 
a/app/src/components/ui/radio-group.tsx b/app/src/components/ui/radio-group.tsx new file mode 100644 index 00000000..fed418e6 --- /dev/null +++ b/app/src/components/ui/radio-group.tsx @@ -0,0 +1,44 @@ +"use client" + +import * as React from "react" +import * as RadioGroupPrimitive from "@radix-ui/react-radio-group" +import { Circle } from "lucide-react" + +import { cn } from "@/lib/utils/cn" + +const RadioGroup = React.forwardRef< + React.ElementRef, + React.ComponentPropsWithoutRef +>(({ className, ...props }, ref) => { + return ( + + ) +}) +RadioGroup.displayName = RadioGroupPrimitive.Root.displayName + +const RadioGroupItem = React.forwardRef< + React.ElementRef, + React.ComponentPropsWithoutRef +>(({ className, ...props }, ref) => { + return ( + + + + + + ) +}) +RadioGroupItem.displayName = RadioGroupPrimitive.Item.displayName + +export { RadioGroup, RadioGroupItem } From ce4269ffa545cd9b908cb9df916718fcd9784a7f Mon Sep 17 00:00:00 2001 From: Jamie Pine Date: Sat, 31 Jan 2026 03:45:41 -0800 Subject: [PATCH 05/33] Refactor build scripts and update release workflow - Commented out the PyTorch CPU configuration in the release workflow for Ubuntu 22.04. - Updated TTS provider documentation to clarify options for Windows and Linux users. - Enhanced build scripts for both CPU and CUDA providers by excluding large unused modules to reduce binary size. 
--- .github/workflows/release.yml | 12 ++++++------ providers/pytorch-cpu/build.py | 18 ++++++++++++++++++ providers/pytorch-cuda/build.py | 18 ++++++++++++++++++ 3 files changed, 42 insertions(+), 6 deletions(-) diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 07f5c98d..ebf84c01 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -28,9 +28,9 @@ jobs: provider: "pytorch-cuda" python-version: "3.12" # PyTorch CPU provider (Linux) - - platform: "ubuntu-22.04" - provider: "pytorch-cpu" - python-version: "3.12" + # - platform: "ubuntu-22.04" + # provider: "pytorch-cpu" + # python-version: "3.12" # PyTorch CUDA provider (Linux) - large binary, uploaded to R2 - platform: "ubuntu-22.04" provider: "pytorch-cuda" @@ -268,10 +268,10 @@ jobs: - **Windows**: Download the `.msi` installer - requires downloading a TTS provider on first use - **Linux**: Download the `.AppImage` or `.deb` package - requires downloading a TTS provider on first use - ### TTS Providers (Windows/Linux) + ### TTS Providers Windows and Linux users will be prompted to download a TTS provider on first launch: - - **PyTorch CPU** (~300MB) - Works on any system - - **PyTorch CUDA** (~2.4GB) - 4-5x faster on NVIDIA GPUs + - **Windows**: PyTorch CPU (~300MB) or PyTorch CUDA (~2.4GB for NVIDIA GPUs) + - **Linux**: PyTorch CUDA (~2.4GB) - requires NVIDIA GPU The app includes automatic updates - future updates will be installed automatically. 
releaseDraft: true diff --git a/providers/pytorch-cpu/build.py b/providers/pytorch-cpu/build.py index 8894edcb..6596c266 100644 --- a/providers/pytorch-cpu/build.py +++ b/providers/pytorch-cpu/build.py @@ -56,6 +56,24 @@ def build_provider(): '--hidden-import', 'librosa', ]) + # Exclude large unused modules to reduce binary size + args.extend([ + '--exclude-module', 'torch.utils.tensorboard', + '--exclude-module', 'tensorboard', + '--exclude-module', 'triton', + '--exclude-module', 'torch.distributed', + '--exclude-module', 'torch._dynamo', + '--exclude-module', 'torch._inductor', + '--exclude-module', 'torch.testing', + '--exclude-module', 'torch.utils.benchmark', + '--exclude-module', 'IPython', + '--exclude-module', 'matplotlib', + '--exclude-module', 'PIL', + '--exclude-module', 'cv2', + '--exclude-module', 'torchvision', + '--exclude-module', 'torchaudio', + ]) + args.extend([ '--noconfirm', '--clean', diff --git a/providers/pytorch-cuda/build.py b/providers/pytorch-cuda/build.py index 8ff67334..565885b2 100644 --- a/providers/pytorch-cuda/build.py +++ b/providers/pytorch-cuda/build.py @@ -58,6 +58,24 @@ def build_provider(): '--hidden-import', 'librosa', ]) + # Exclude large unused modules to reduce binary size + args.extend([ + '--exclude-module', 'torch.utils.tensorboard', + '--exclude-module', 'tensorboard', + '--exclude-module', 'triton', + '--exclude-module', 'torch.distributed', + '--exclude-module', 'torch._dynamo', + '--exclude-module', 'torch._inductor', + '--exclude-module', 'torch.testing', + '--exclude-module', 'torch.utils.benchmark', + '--exclude-module', 'IPython', + '--exclude-module', 'matplotlib', + '--exclude-module', 'PIL', + '--exclude-module', 'cv2', + '--exclude-module', 'torchvision', + '--exclude-module', 'torchaudio', + ]) + args.extend([ '--noconfirm', '--clean', From ab10c26ce44a98692048c61b2fea293ff62a6931 Mon Sep 17 00:00:00 2001 From: Jamie Pine Date: Sat, 31 Jan 2026 04:11:30 -0800 Subject: [PATCH 06/33] Update release workflow 
to include libasound2-dev dependency for Ubuntu 22.04 --- .github/workflows/release.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index ebf84c01..cce4670b 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -151,7 +151,7 @@ jobs: if: matrix.platform == 'ubuntu-22.04' run: | sudo apt-get update - sudo apt-get install -y libwebkit2gtk-4.1-dev libappindicator3-dev librsvg2-dev patchelf llvm-dev + sudo apt-get install -y libwebkit2gtk-4.1-dev libappindicator3-dev librsvg2-dev patchelf llvm-dev libasound2-dev - name: Install LLVM (macOS) if: matrix.platform == 'macos-latest' || matrix.platform == 'macos-15-intel' From a52ff7d9504a5fba5d446b34ebe49ec5fda90c33 Mon Sep 17 00:00:00 2001 From: Jamie Pine Date: Sat, 31 Jan 2026 05:10:27 -0800 Subject: [PATCH 07/33] Add Linux audio capture module with unsupported functionality - Introduced a new `linux.rs` module for audio capture, indicating that audio capture is not supported on Linux at this time. - Updated `mod.rs` to include the Linux module conditionally based on the target OS. --- tauri/src-tauri/src/audio_capture/linux.rs | 16 ++++++++++++++++ tauri/src-tauri/src/audio_capture/mod.rs | 4 ++++ 2 files changed, 20 insertions(+) create mode 100644 tauri/src-tauri/src/audio_capture/linux.rs diff --git a/tauri/src-tauri/src/audio_capture/linux.rs b/tauri/src-tauri/src/audio_capture/linux.rs new file mode 100644 index 00000000..7550e086 --- /dev/null +++ b/tauri/src-tauri/src/audio_capture/linux.rs @@ -0,0 +1,16 @@ +use crate::audio_capture::AudioCaptureState; + +pub async fn start_capture( + _state: &AudioCaptureState, + _max_duration_secs: u32, +) -> Result<(), String> { + Err("Audio capture is not supported on Linux, YET. 
Use the built-in recording features instead.".to_string()) +} + +pub async fn stop_capture(_state: &AudioCaptureState) -> Result { + Err("Audio capture is not supported on Linux.".to_string()) +} + +pub fn is_supported() -> bool { + false +} diff --git a/tauri/src-tauri/src/audio_capture/mod.rs b/tauri/src-tauri/src/audio_capture/mod.rs index 7a55c334..a67bf795 100644 --- a/tauri/src-tauri/src/audio_capture/mod.rs +++ b/tauri/src-tauri/src/audio_capture/mod.rs @@ -2,11 +2,15 @@ mod macos; #[cfg(target_os = "windows")] mod windows; +#[cfg(target_os = "linux")] +mod linux; #[cfg(target_os = "macos")] pub use macos::*; #[cfg(target_os = "windows")] pub use windows::*; +#[cfg(target_os = "linux")] +pub use linux::*; use std::sync::{Arc, Mutex}; From 942064912a3bdd6fec0e5517773b3bb88e51234a Mon Sep 17 00:00:00 2001 From: Jamie Pine Date: Sat, 31 Jan 2026 20:18:27 -0800 Subject: [PATCH 08/33] Update TTS provider methods and dependencies - Renamed `load_model` to `load_model_async` in TTS provider classes for clarity and consistency. - Added compatibility alias for `load_model` to maintain existing functionality. - Enhanced `get_model_status` to handle both synchronous and asynchronous check functions. - Updated version numbers in `bun.lock` and `Cargo.lock` to 0.1.12, reflecting recent changes. 
--- backend/main.py | 14 ++++++++++++-- backend/providers/base.py | 2 +- backend/providers/bundled.py | 6 ++++-- backend/providers/local.py | 5 ++++- bun.lock | 11 +++++++---- tauri/src-tauri/Cargo.lock | 2 +- 6 files changed, 29 insertions(+), 11 deletions(-) diff --git a/backend/main.py b/backend/main.py index 3bd4b7e6..f6d00a1f 100644 --- a/backend/main.py +++ b/backend/main.py @@ -1362,7 +1362,12 @@ def check_whisper_loaded(model_size: str): if asyncio.iscoroutinefunction(check_func): loaded = await check_func() else: - loaded = check_func() + result = check_func() + # Handle lambdas that return coroutines + if asyncio.iscoroutine(result): + loaded = await result + else: + loaded = result except Exception: loaded = False @@ -1389,7 +1394,12 @@ def check_whisper_loaded(model_size: str): if asyncio.iscoroutinefunction(check_func): loaded = await check_func() else: - loaded = check_func() + result = check_func() + # Handle lambdas that return coroutines + if asyncio.iscoroutine(result): + loaded = await result + else: + loaded = result except Exception: loaded = False diff --git a/backend/providers/base.py b/backend/providers/base.py index f3a6b4c2..50b05ce0 100644 --- a/backend/providers/base.py +++ b/backend/providers/base.py @@ -72,7 +72,7 @@ async def combine_voice_prompts( """ ... - async def load_model(self, model_size: str) -> None: + async def load_model_async(self, model_size: str) -> None: """Load TTS model.""" ... 
diff --git a/backend/providers/bundled.py b/backend/providers/bundled.py index b4a5e2ca..9c37e23c 100644 --- a/backend/providers/bundled.py +++ b/backend/providers/bundled.py @@ -55,14 +55,16 @@ async def combine_voice_prompts( backend = self._get_backend() return await backend.combine_voice_prompts(audio_paths, reference_texts) - async def load_model(self, model_size: str) -> None: + async def load_model_async(self, model_size: str) -> None: """Load TTS model.""" backend = self._get_backend() - # Backends use load_model_async, but Protocol defines load_model if hasattr(backend, 'load_model_async'): await backend.load_model_async(model_size) else: await backend.load_model(model_size) + + # Alias for compatibility + load_model = load_model_async def unload_model(self) -> None: """Unload model to free memory.""" diff --git a/backend/providers/local.py b/backend/providers/local.py index 004e6fc3..3ea4d75b 100644 --- a/backend/providers/local.py +++ b/backend/providers/local.py @@ -115,11 +115,14 @@ async def combine_voice_prompts( return mixed, combined_text - async def load_model(self, model_size: str) -> None: + async def load_model_async(self, model_size: str) -> None: """Load TTS model.""" # Track the requested model size - the provider server will load it # when generate() is called with this size self._current_model_size = model_size + + # Alias for compatibility + load_model = load_model_async def unload_model(self) -> None: """Unload model to free memory.""" diff --git a/bun.lock b/bun.lock index 9e08a825..3879020e 100644 --- a/bun.lock +++ b/bun.lock @@ -13,7 +13,7 @@ }, "app": { "name": "@voicebox/app", - "version": "0.1.11", + "version": "0.1.12", "dependencies": { "@dnd-kit/core": "^6.3.1", "@dnd-kit/sortable": "^10.0.0", @@ -26,6 +26,7 @@ "@radix-ui/react-label": "^2.1.0", "@radix-ui/react-popover": "^1.1.1", "@radix-ui/react-progress": "^1.1.0", + "@radix-ui/react-radio-group": "^1.2.0", "@radix-ui/react-scroll-area": "^1.1.0", "@radix-ui/react-select": 
"^2.1.1", "@radix-ui/react-separator": "^1.1.0", @@ -68,7 +69,7 @@ }, "landing": { "name": "@voicebox/landing", - "version": "0.1.11", + "version": "0.1.12", "dependencies": { "@radix-ui/react-separator": "^1.1.8", "@radix-ui/react-slot": "^1.2.4", @@ -93,7 +94,7 @@ }, "tauri": { "name": "@voicebox/tauri", - "version": "0.1.11", + "version": "0.1.12", "dependencies": { "@tauri-apps/api": "^2.0.0", "@tauri-apps/plugin-dialog": "^2.0.0", @@ -116,7 +117,7 @@ }, "web": { "name": "@voicebox/web", - "version": "0.1.11", + "version": "0.1.12", "dependencies": { "@tanstack/react-query": "^5.0.0", "react": "^18.3.0", @@ -407,6 +408,8 @@ "@radix-ui/react-progress": ["@radix-ui/react-progress@1.1.8", "", { "dependencies": { "@radix-ui/react-context": "1.1.3", "@radix-ui/react-primitive": "2.1.4" }, "peerDependencies": { "@types/react": "*", "@types/react-dom": "*", "react": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc", "react-dom": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc" }, "optionalPeers": ["@types/react", "@types/react-dom"] }, "sha512-+gISHcSPUJ7ktBy9RnTqbdKW78bcGke3t6taawyZ71pio1JewwGSJizycs7rLhGTvMJYCQB1DBK4KQsxs7U8dA=="], + "@radix-ui/react-radio-group": ["@radix-ui/react-radio-group@1.3.8", "", { "dependencies": { "@radix-ui/primitive": "1.1.3", "@radix-ui/react-compose-refs": "1.1.2", "@radix-ui/react-context": "1.1.2", "@radix-ui/react-direction": "1.1.1", "@radix-ui/react-presence": "1.1.5", "@radix-ui/react-primitive": "2.1.3", "@radix-ui/react-roving-focus": "1.1.11", "@radix-ui/react-use-controllable-state": "1.2.2", "@radix-ui/react-use-previous": "1.1.1", "@radix-ui/react-use-size": "1.1.1" }, "peerDependencies": { "@types/react": "*", "@types/react-dom": "*", "react": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc", "react-dom": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc" }, "optionalPeers": ["@types/react", "@types/react-dom"] }, "sha512-VBKYIYImA5zsxACdisNQ3BjCBfmbGH3kQlnFVqlWU4tXwjy7cGX8ta80BcrO+WJXIn5iBylEH3K6ZTlee//lgQ=="], + 
"@radix-ui/react-roving-focus": ["@radix-ui/react-roving-focus@1.1.11", "", { "dependencies": { "@radix-ui/primitive": "1.1.3", "@radix-ui/react-collection": "1.1.7", "@radix-ui/react-compose-refs": "1.1.2", "@radix-ui/react-context": "1.1.2", "@radix-ui/react-direction": "1.1.1", "@radix-ui/react-id": "1.1.1", "@radix-ui/react-primitive": "2.1.3", "@radix-ui/react-use-callback-ref": "1.1.1", "@radix-ui/react-use-controllable-state": "1.2.2" }, "peerDependencies": { "@types/react": "*", "@types/react-dom": "*", "react": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc", "react-dom": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc" }, "optionalPeers": ["@types/react", "@types/react-dom"] }, "sha512-7A6S9jSgm/S+7MdtNDSb+IU859vQqJ/QAtcYQcfFC6W8RS4IxIZDldLR0xqCFZ6DCyrQLjLPsxtTNch5jVA4lA=="], "@radix-ui/react-scroll-area": ["@radix-ui/react-scroll-area@1.2.10", "", { "dependencies": { "@radix-ui/number": "1.1.1", "@radix-ui/primitive": "1.1.3", "@radix-ui/react-compose-refs": "1.1.2", "@radix-ui/react-context": "1.1.2", "@radix-ui/react-direction": "1.1.1", "@radix-ui/react-presence": "1.1.5", "@radix-ui/react-primitive": "2.1.3", "@radix-ui/react-use-callback-ref": "1.1.1", "@radix-ui/react-use-layout-effect": "1.1.1" }, "peerDependencies": { "@types/react": "*", "@types/react-dom": "*", "react": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc", "react-dom": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc" }, "optionalPeers": ["@types/react", "@types/react-dom"] }, "sha512-tAXIa1g3sM5CGpVT0uIbUx/U3Gs5N8T52IICuCtObaos1S8fzsrPXG5WObkQN3S6NVl6wKgPhAIiBGbWnvc97A=="], diff --git a/tauri/src-tauri/Cargo.lock b/tauri/src-tauri/Cargo.lock index 4528097c..35b15188 100644 --- a/tauri/src-tauri/Cargo.lock +++ b/tauri/src-tauri/Cargo.lock @@ -5041,7 +5041,7 @@ checksum = "0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a" [[package]] name = "voicebox" -version = "0.1.11" +version = "0.1.12" dependencies = [ "base64 0.22.1", "core-foundation-sys", From 
dcbdf3e89b01b6b5784ed3e75b548ba42ec7c928 Mon Sep 17 00:00:00 2001 From: Jamie Pine Date: Sat, 31 Jan 2026 20:18:32 -0800 Subject: [PATCH 09/33] =?UTF-8?q?Bump=20version:=200.1.12=20=E2=86=92=200.?= =?UTF-8?q?1.13?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .bumpversion.cfg | 2 +- app/package.json | 2 +- backend/__init__.py | 2 +- landing/package.json | 2 +- package.json | 2 +- tauri/package.json | 2 +- tauri/src-tauri/Cargo.toml | 2 +- tauri/src-tauri/tauri.conf.json | 2 +- web/package.json | 2 +- 9 files changed, 9 insertions(+), 9 deletions(-) diff --git a/.bumpversion.cfg b/.bumpversion.cfg index 37ef7924..151a8ac0 100644 --- a/.bumpversion.cfg +++ b/.bumpversion.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 0.1.12 +current_version = 0.1.13 commit = True tag = True tag_name = v{new_version} diff --git a/app/package.json b/app/package.json index 00170cca..0849a94b 100644 --- a/app/package.json +++ b/app/package.json @@ -1,6 +1,6 @@ { "name": "@voicebox/app", - "version": "0.1.12", + "version": "0.1.13", "private": true, "type": "module", "scripts": { diff --git a/backend/__init__.py b/backend/__init__.py index e75772bd..36b3fee7 100644 --- a/backend/__init__.py +++ b/backend/__init__.py @@ -1,3 +1,3 @@ # Backend package -__version__ = "0.1.12" +__version__ = "0.1.13" diff --git a/landing/package.json b/landing/package.json index 655e57a6..8af4d00e 100644 --- a/landing/package.json +++ b/landing/package.json @@ -1,6 +1,6 @@ { "name": "@voicebox/landing", - "version": "0.1.12", + "version": "0.1.13", "description": "Landing page for voicebox.sh", "scripts": { "dev": "bun --bun next dev --turbo", diff --git a/package.json b/package.json index c0f3c21e..f6af4cbd 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "voicebox", - "version": "0.1.12", + "version": "0.1.13", "private": true, "workspaces": [ "app", diff --git a/tauri/package.json b/tauri/package.json index 163f56c8..f7ec77ec 
100644 --- a/tauri/package.json +++ b/tauri/package.json @@ -1,7 +1,7 @@ { "name": "@voicebox/tauri", "private": true, - "version": "0.1.12", + "version": "0.1.13", "type": "module", "scripts": { "dev": "vite", diff --git a/tauri/src-tauri/Cargo.toml b/tauri/src-tauri/Cargo.toml index 739dd34d..aa3b1a9c 100644 --- a/tauri/src-tauri/Cargo.toml +++ b/tauri/src-tauri/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "voicebox" -version = "0.1.12" +version = "0.1.13" description = "A production-quality desktop app for Qwen3-TTS voice cloning and generation" authors = ["you"] license = "" diff --git a/tauri/src-tauri/tauri.conf.json b/tauri/src-tauri/tauri.conf.json index 53b95d18..294d5049 100644 --- a/tauri/src-tauri/tauri.conf.json +++ b/tauri/src-tauri/tauri.conf.json @@ -1,7 +1,7 @@ { "$schema": "https://schema.tauri.app/config/2", "productName": "Voicebox", - "version": "0.1.12", + "version": "0.1.13", "identifier": "sh.voicebox.app", "build": { "beforeDevCommand": "bun run dev", diff --git a/web/package.json b/web/package.json index 99d82c56..1f44b3df 100644 --- a/web/package.json +++ b/web/package.json @@ -1,7 +1,7 @@ { "name": "@voicebox/web", "private": true, - "version": "0.1.12", + "version": "0.1.13", "type": "module", "scripts": { "dev": "vite", From 6dd5bb231198eb3ee8c42643030a2040d861da7c Mon Sep 17 00:00:00 2001 From: Jamie Pine Date: Sat, 31 Jan 2026 22:18:01 -0800 Subject: [PATCH 10/33] hugeicons --- app/package.json | 3 +- app/src/App.tsx | 11 ++--- .../components/AudioPlayer/AudioPlayer.tsx | 11 ++--- app/src/components/AudioTab/AudioTab.tsx | 27 ++++++------ .../Generation/FloatingGenerateBox.tsx | 9 ++-- .../components/Generation/GenerationForm.tsx | 7 +-- app/src/components/History/HistoryTable.tsx | 33 +++++++------- app/src/components/MainEditor/MainEditor.tsx | 7 +-- .../ServerSettings/ModelManagement.tsx | 30 ++++++------- .../ServerSettings/ModelProgress.tsx | 7 +-- .../ServerSettings/ProviderSettings.tsx | 17 ++++---- 
.../ServerSettings/ServerStatus.tsx | 7 +-- .../ServerSettings/UpdateStatus.tsx | 15 ++++--- app/src/components/Sidebar.tsx | 21 ++++----- .../components/StoriesTab/StoryChatItem.tsx | 13 +++--- .../components/StoriesTab/StoryContent.tsx | 7 +-- app/src/components/StoriesTab/StoryList.tsx | 13 +++--- .../StoriesTab/StoryTrackEditor.tsx | 41 +++++++++--------- .../VoiceProfiles/AudioSampleRecording.tsx | 13 +++--- .../VoiceProfiles/AudioSampleSystem.tsx | 13 +++--- .../VoiceProfiles/AudioSampleUpload.tsx | 11 ++--- .../components/VoiceProfiles/ProfileCard.tsx | 11 ++--- .../components/VoiceProfiles/ProfileForm.tsx | 17 ++++---- .../components/VoiceProfiles/ProfileList.tsx | 7 +-- .../components/VoiceProfiles/SampleList.tsx | 21 ++++----- .../components/VoiceProfiles/SampleUpload.tsx | 9 ++-- app/src/components/VoicesTab/VoicesTab.tsx | 13 +++--- app/src/components/ui/checkbox.tsx | 5 ++- app/src/components/ui/dialog.tsx | 5 ++- app/src/components/ui/dropdown-menu.tsx | 9 ++-- app/src/components/ui/multi-select.tsx | 7 +-- app/src/components/ui/radio-group.tsx | 5 ++- app/src/components/ui/select.tsx | 11 ++--- app/src/components/ui/toast.tsx | 5 ++- app/src/hooks/useAutoUpdater.ts | 2 +- app/src/hooks/useAutoUpdater.tsx | 7 +-- app/src/lib/api/types.ts | 1 + app/src/lib/hooks/useModelDownloadToast.tsx | 15 ++++--- backend/voicebox-server.spec | 6 +-- bun.lock | 14 ++++-- tauri/src-tauri/Cargo.lock | 2 +- tauri/src-tauri/gen/Assets.car | Bin 3847048 -> 3847048 bytes 42 files changed, 261 insertions(+), 227 deletions(-) diff --git a/app/package.json b/app/package.json index 0849a94b..07028409 100644 --- a/app/package.json +++ b/app/package.json @@ -17,6 +17,8 @@ "@dnd-kit/sortable": "^10.0.0", "@dnd-kit/utilities": "^3.2.2", "@hookform/resolvers": "^3.9.0", + "@hugeicons/core-free-icons": "^3.1.1", + "@hugeicons/react": "^1.1.4", "@radix-ui/react-alert-dialog": "^1.1.1", "@radix-ui/react-avatar": "^1.1.0", "@radix-ui/react-dialog": "^1.1.1", @@ -44,7 +46,6 @@ "clsx": 
"^2.1.1", "date-fns": "^3.6.0", "framer-motion": "^12.29.0", - "lucide-react": "^0.454.0", "motion": "^12.29.0", "react": "^18.3.0", "react-dom": "^18.3.0", diff --git a/app/src/App.tsx b/app/src/App.tsx index fbe29118..e54bc9dd 100644 --- a/app/src/App.tsx +++ b/app/src/App.tsx @@ -40,7 +40,7 @@ function App() { const serverStartingRef = useRef(false); // Automatically check for app updates on startup and show toast notifications - useAutoUpdater({ checkOnMount: true, showToast: true }); + useAutoUpdater(true); // Sync stored setting to Rust on startup useEffect(() => { @@ -82,8 +82,7 @@ function App() { console.log('Dev mode: Skipping auto-start of server (run it separately)'); setServerReady(true); // Mark as ready so UI doesn't show loading screen // Mark that server was not started by app (so we don't try to stop it on close) - // @ts-expect-error - adding property to window - window.__voiceboxServerStartedByApp = false; + (window as any).__voiceboxServerStartedByApp = false; return; } @@ -103,14 +102,12 @@ function App() { useServerStore.getState().setServerUrl(serverUrl); setServerReady(true); // Mark that we started the server (so we know to stop it on close) - // @ts-expect-error - adding property to window - window.__voiceboxServerStartedByApp = true; + (window as any).__voiceboxServerStartedByApp = true; }) .catch((error) => { console.error('Failed to auto-start server:', error); serverStartingRef.current = false; - // @ts-expect-error - adding property to window - window.__voiceboxServerStartedByApp = false; + (window as any).__voiceboxServerStartedByApp = false; }); // Cleanup: stop server on actual unmount (not StrictMode remount) diff --git a/app/src/components/AudioPlayer/AudioPlayer.tsx b/app/src/components/AudioPlayer/AudioPlayer.tsx index 48dd9e78..c49f7a38 100644 --- a/app/src/components/AudioPlayer/AudioPlayer.tsx +++ b/app/src/components/AudioPlayer/AudioPlayer.tsx @@ -1,5 +1,6 @@ import { useQuery } from '@tanstack/react-query'; -import { 
Pause, Play, Repeat, Volume2, VolumeX, X } from 'lucide-react'; +import { HugeiconsIcon } from '@hugeicons/react'; +import { PauseIcon, PlayIcon, RepeatIcon, VolumeHighIcon, VolumeMuteIcon, Cancel01Icon } from '@hugeicons/core-free-icons'; import { useEffect, useMemo, useRef, useState } from 'react'; import WaveSurfer from 'wavesurfer.js'; import { Button } from '@/components/ui/button'; @@ -832,7 +833,7 @@ export function AudioPlayer() { className="shrink-0" title={duration === 0 && !isLoading ? 'Audio not loaded' : ''} > - {isPlaying ? : } + {isPlaying ? : } {/* Waveform */} @@ -873,7 +874,7 @@ export function AudioPlayer() { className={isLooping ? 'text-primary' : ''} title="Toggle loop" > - + {/* Volume Control */} @@ -884,7 +885,7 @@ export function AudioPlayer() { onClick={() => setVolume(volume > 0 ? 0 : 1)} className="h-8 w-8" > - {volume > 0 ? : } + {volume > 0 ? : } - +
diff --git a/app/src/components/AudioTab/AudioTab.tsx b/app/src/components/AudioTab/AudioTab.tsx index f76e99d7..150e7660 100644 --- a/app/src/components/AudioTab/AudioTab.tsx +++ b/app/src/components/AudioTab/AudioTab.tsx @@ -1,5 +1,6 @@ import { useMutation, useQuery, useQueryClient } from '@tanstack/react-query'; -import { Check, CheckCircle2, Edit, Plus, Speaker, Trash2 } from 'lucide-react'; +import { HugeiconsIcon } from '@hugeicons/react'; +import { CheckmarkCircle01Icon, CheckmarkCircle02Icon, Edit01Icon, Add01Icon, SpeakerIcon, Delete01Icon } from '@hugeicons/core-free-icons'; import { useState } from 'react'; import { Badge } from '@/components/ui/badge'; import { Button } from '@/components/ui/button'; @@ -135,7 +136,7 @@ export function AudioTab() {

Audio Channels

@@ -150,13 +151,13 @@ export function AudioTab() { > {allChannels.length === 0 ? (
- +

No audio channels yet. Create your first channel to route voices to specific devices.

@@ -178,7 +179,7 @@ export function AudioTab() {
- +

{channel.name}

@@ -235,7 +236,7 @@ export function AudioTab() { setEditingChannel(channel.id); }} > - +
)} @@ -325,10 +326,10 @@ export function AudioTab() { isConnected ? 'bg-accent border-accent' : 'border-muted-foreground/30', )} > - {isConnected && } + {isConnected && }
) : device.is_default ? ( - + ) : null} {device.name} @@ -339,7 +340,7 @@ export function AudioTab() {
) : (
- +

{platform.metadata.isTauri ? 'No audio devices found' : 'Audio device selection requires Tauri'}

@@ -494,7 +495,7 @@ function CreateChannelDialog({ open, onOpenChange, devices, onCreate }: CreateCh setSelectedDevices(selectedDevices.filter((id) => id !== deviceId)) } > - +
); @@ -602,7 +603,7 @@ function EditChannelDialog({ setSelectedDevices(selectedDevices.filter((id) => id !== deviceId)) } > - + ); @@ -648,7 +649,7 @@ function EditChannelDialog({ setSelectedVoices(selectedVoices.filter((id) => id !== profileId)) } > - + ); diff --git a/app/src/components/Generation/FloatingGenerateBox.tsx b/app/src/components/Generation/FloatingGenerateBox.tsx index b020a81f..dc835dfb 100644 --- a/app/src/components/Generation/FloatingGenerateBox.tsx +++ b/app/src/components/Generation/FloatingGenerateBox.tsx @@ -1,6 +1,7 @@ import { useMatchRoute } from '@tanstack/react-router'; import { AnimatePresence, motion } from 'framer-motion'; -import { Loader2, MessageSquare, Sparkles } from 'lucide-react'; +import { HugeiconsIcon } from '@hugeicons/react'; +import { Loading01Icon, TextSquareIcon, SparklesIcon } from '@hugeicons/core-free-icons'; import { useEffect, useRef, useState } from 'react'; import { Button } from '@/components/ui/button'; import { Form, FormControl, FormField, FormItem, FormMessage } from '@/components/ui/form'; @@ -301,9 +302,9 @@ export function FloatingGenerateBox({ size="icon" > {isPending ? ( - + ) : ( - + )} @@ -327,7 +328,7 @@ export function FloatingGenerateBox({ : 'bg-card border border-border hover:bg-background/50', )} > - + )} diff --git a/app/src/components/Generation/GenerationForm.tsx b/app/src/components/Generation/GenerationForm.tsx index 31b100f8..edf7b749 100644 --- a/app/src/components/Generation/GenerationForm.tsx +++ b/app/src/components/Generation/GenerationForm.tsx @@ -1,4 +1,5 @@ -import { Loader2, Mic } from 'lucide-react'; +import { HugeiconsIcon } from '@hugeicons/react'; +import { Loading01Icon, Mic01Icon } from '@hugeicons/core-free-icons'; import { Button } from '@/components/ui/button'; import { Card, CardContent, CardHeader, CardTitle } from '@/components/ui/card'; import { @@ -46,7 +47,7 @@ export function GenerationForm() { Voice Profile {selectedProfile ? (
- + {selectedProfile.name} {selectedProfile.language}
@@ -177,7 +178,7 @@ export function GenerationForm() { > {isPending ? ( <> - + Generating... ) : ( diff --git a/app/src/components/History/HistoryTable.tsx b/app/src/components/History/HistoryTable.tsx index e572f69e..0c889046 100644 --- a/app/src/components/History/HistoryTable.tsx +++ b/app/src/components/History/HistoryTable.tsx @@ -1,12 +1,13 @@ +import { HugeiconsIcon } from '@hugeicons/react'; import { - AudioWaveform, - Download, - FileArchive, - Loader2, - MoreHorizontal, - Play, - Trash2, -} from 'lucide-react'; + WaveIcon, + Download01Icon, + Archive01Icon, + Loading01Icon, + MoreHorizontalIcon, + PlayIcon, + Delete01Icon, +} from '@hugeicons/core-free-icons'; import { useEffect, useRef, useState } from 'react'; import { Button } from '@/components/ui/button'; import { @@ -222,7 +223,7 @@ export function HistoryTable() { if (isLoading && page === 0) { return (
- +
); } @@ -268,7 +269,7 @@ export function HistoryTable() { > {/* Waveform icon */}
- +
{/* Left side - Meta information */} @@ -310,28 +311,28 @@ export function HistoryTable() { className="h-8 w-8" aria-label="Actions" > - + handlePlay(gen.id, gen.text, gen.profile_id)} > - + Play handleDownloadAudio(gen.id, gen.text)} disabled={exportGenerationAudio.isPending} > - + Export Audio handleExportPackage(gen.id, gen.text)} disabled={exportGeneration.isPending} > - + Export Package - + Delete @@ -352,7 +353,7 @@ export function HistoryTable() { {/* Load more trigger element */} {hasMore && (
- {isFetching && } + {isFetching && }
)} diff --git a/app/src/components/MainEditor/MainEditor.tsx b/app/src/components/MainEditor/MainEditor.tsx index 9d597b1e..17125893 100644 --- a/app/src/components/MainEditor/MainEditor.tsx +++ b/app/src/components/MainEditor/MainEditor.tsx @@ -1,4 +1,5 @@ -import { Sparkles, Upload } from 'lucide-react'; +import { HugeiconsIcon } from '@hugeicons/react'; +import { SparklesIcon, Upload01Icon } from '@hugeicons/core-free-icons'; import { useRef, useState } from 'react'; import { FloatingGenerateBox } from '@/components/Generation/FloatingGenerateBox'; import { HistoryTable } from '@/components/History/HistoryTable'; @@ -89,7 +90,7 @@ export function MainEditor() {

Voicebox

diff --git a/app/src/components/ServerSettings/ModelManagement.tsx b/app/src/components/ServerSettings/ModelManagement.tsx index 4a5fd439..f7e10fdb 100644 --- a/app/src/components/ServerSettings/ModelManagement.tsx +++ b/app/src/components/ServerSettings/ModelManagement.tsx @@ -1,5 +1,6 @@ import { useMutation, useQuery, useQueryClient } from '@tanstack/react-query'; -import { Download, Loader2, Trash2 } from 'lucide-react'; +import { HugeiconsIcon } from '@hugeicons/react'; +import { Download01Icon, Loading01Icon, Delete01Icon } from '@hugeicons/core-free-icons'; import { useCallback, useState } from 'react'; import { AlertDialog, @@ -67,11 +68,11 @@ export function ModelManagement() { const handleDownload = async (modelName: string) => { console.log('[Download] Button clicked for:', modelName, 'at', new Date().toISOString()); - + // Find display name const model = modelStatus?.models.find((m) => m.model_name === modelName); const displayName = model?.display_name || modelName; - + try { // IMPORTANT: Call the API FIRST before setting state // Setting state enables the SSE EventSource in useModelDownloadToast, @@ -79,11 +80,11 @@ export function ModelManagement() { console.log('[Download] Calling download API for:', modelName); const result = await apiClient.triggerModelDownload(modelName); console.log('[Download] Download API responded:', result); - + // NOW set state to enable SSE tracking (after download has started on backend) setDownloadingModel(modelName); setDownloadingDisplayName(displayName); - + // Download initiated successfully - state will be cleared when SSE reports completion // or by the polling interval detecting the model is downloaded queryClient.invalidateQueries({ queryKey: ['modelStatus'] }); @@ -117,7 +118,7 @@ export function ModelManagement() { // Invalidate AND explicitly refetch to ensure UI updates // Using refetchType: 'all' ensures we refetch even if the query is stale console.log('[Delete] Invalidating modelStatus query'); - await 
queryClient.invalidateQueries({ + await queryClient.invalidateQueries({ queryKey: ['modelStatus'], refetchType: 'all', }); @@ -153,7 +154,7 @@ export function ModelManagement() { {isLoading ? (
- +
) : modelStatus ? (
@@ -212,7 +213,6 @@ export function ModelManagement() { ))}
- ) : null}
@@ -246,7 +246,7 @@ export function ModelManagement() { > {deleteMutation.isPending ? ( <> - + Deleting... ) : ( @@ -265,20 +265,20 @@ interface ModelItemProps { model_name: string; display_name: string; downloaded: boolean; - downloading?: boolean; // From server - true if download in progress + downloading?: boolean; // From server - true if download in progress size_mb?: number; loaded: boolean; }; onDownload: () => void; onDelete: () => void; - isDownloading: boolean; // Local state - true if user just clicked download + isDownloading: boolean; // Local state - true if user just clicked download formatSize: (sizeMb?: number) => string; } function ModelItem({ model, onDownload, onDelete, isDownloading, formatSize }: ModelItemProps) { // Use server's downloading state OR local state (for immediate feedback before server updates) const showDownloading = model.downloading || isDownloading; - + return (
@@ -315,17 +315,17 @@ function ModelItem({ model, onDownload, onDelete, isDownloading, formatSize }: M disabled={model.loaded} title={model.loaded ? 'Unload model before deleting' : 'Delete model'} > - +
) : showDownloading ? ( ) : ( )} diff --git a/app/src/components/ServerSettings/ModelProgress.tsx b/app/src/components/ServerSettings/ModelProgress.tsx index 76aa99f1..30229c50 100644 --- a/app/src/components/ServerSettings/ModelProgress.tsx +++ b/app/src/components/ServerSettings/ModelProgress.tsx @@ -1,4 +1,5 @@ -import { Loader2, XCircle } from 'lucide-react'; +import { HugeiconsIcon } from '@hugeicons/react'; +import { Loading01Icon, CancelCircleIcon } from '@hugeicons/core-free-icons'; import { useEffect, useState } from 'react'; import { Card, CardContent, CardHeader, CardTitle } from '@/components/ui/card'; import { Progress } from '@/components/ui/progress'; @@ -74,10 +75,10 @@ export function ModelProgress({ modelName, displayName, isDownloading = false }: const getStatusIcon = () => { switch (progress.status) { case 'error': - return ; + return ; case 'downloading': case 'extracting': - return ; + return ; default: return null; } diff --git a/app/src/components/ServerSettings/ProviderSettings.tsx b/app/src/components/ServerSettings/ProviderSettings.tsx index c18874a9..7ae237aa 100644 --- a/app/src/components/ServerSettings/ProviderSettings.tsx +++ b/app/src/components/ServerSettings/ProviderSettings.tsx @@ -1,5 +1,6 @@ import { useMutation, useQuery, useQueryClient } from '@tanstack/react-query'; -import { Download, Loader2, Trash2 } from 'lucide-react'; +import { HugeiconsIcon } from '@hugeicons/react'; +import { Download01Icon, Loading01Icon, Delete01Icon } from '@hugeicons/core-free-icons'; import { useCallback, useState } from 'react'; import { AlertDialog, @@ -155,7 +156,7 @@ export function ProviderSettings() {
- +
@@ -217,10 +218,10 @@ export function ProviderSettings() { disabled={downloadingProvider === 'pytorch-cuda'} > {downloadingProvider === 'pytorch-cuda' ? ( - + ) : ( <> - + Download (2.4GB) )} @@ -241,7 +242,7 @@ export function ProviderSettings() { size="sm" variant="ghost" > - + )}
@@ -270,10 +271,10 @@ export function ProviderSettings() { disabled={downloadingProvider === 'pytorch-cpu'} > {downloadingProvider === 'pytorch-cpu' ? ( - + ) : ( <> - + Download (300MB) )} @@ -294,7 +295,7 @@ export function ProviderSettings() { size="sm" variant="ghost" > - + )} diff --git a/app/src/components/ServerSettings/ServerStatus.tsx b/app/src/components/ServerSettings/ServerStatus.tsx index 02a94ec2..8b5e5197 100644 --- a/app/src/components/ServerSettings/ServerStatus.tsx +++ b/app/src/components/ServerSettings/ServerStatus.tsx @@ -1,4 +1,5 @@ -import { Loader2, XCircle } from 'lucide-react'; +import { HugeiconsIcon } from '@hugeicons/react'; +import { Loading01Icon, CancelCircleIcon } from '@hugeicons/core-free-icons'; import { Badge } from '@/components/ui/badge'; import { Card, CardContent, CardHeader, CardTitle } from '@/components/ui/card'; import { useServerHealth } from '@/lib/hooks/useServer'; @@ -32,12 +33,12 @@ export function ServerStatus() { {isLoading ? (
- + Checking connection...
) : error ? (
- + Connection failed: {error.message}
) : health ? ( diff --git a/app/src/components/ServerSettings/UpdateStatus.tsx b/app/src/components/ServerSettings/UpdateStatus.tsx index a3d832aa..5cab320e 100644 --- a/app/src/components/ServerSettings/UpdateStatus.tsx +++ b/app/src/components/ServerSettings/UpdateStatus.tsx @@ -1,4 +1,5 @@ -import { AlertCircle, Download, RefreshCw } from 'lucide-react'; +import { HugeiconsIcon } from '@hugeicons/react'; +import { AlertCircleIcon, Download01Icon, Refresh01Icon } from '@hugeicons/core-free-icons'; import { useEffect, useState } from 'react'; import { Badge } from '@/components/ui/badge'; import { Button } from '@/components/ui/button'; @@ -36,21 +37,21 @@ export function UpdateStatus() { variant="outline" size="sm" > - + Check for Updates {status.checking && (
- + Checking for updates...
)} {status.error && (
- + {status.error}
)} @@ -65,7 +66,7 @@ export function UpdateStatus() { New @@ -75,7 +76,7 @@ export function UpdateStatus() {
- + Downloading update...
{status.downloadProgress !== undefined && ( @@ -109,7 +110,7 @@ export function UpdateStatus() { your convenience.
diff --git a/app/src/components/Sidebar.tsx b/app/src/components/Sidebar.tsx index a849344f..04442db0 100644 --- a/app/src/components/Sidebar.tsx +++ b/app/src/components/Sidebar.tsx @@ -1,5 +1,6 @@ import { Link, useMatchRoute } from '@tanstack/react-router'; -import { Box, BookOpen, Loader2, Mic, Server, Speaker, Volume2 } from 'lucide-react'; +import { HugeiconsIcon } from '@hugeicons/react'; +import { PackageIcon, Book01Icon, Loading01Icon, Mic01Icon, McpServerIcon, SpeakerIcon, VolumeHighIcon } from '@hugeicons/core-free-icons'; import voiceboxLogo from '@/assets/voicebox-logo.png'; import { cn } from '@/lib/utils/cn'; import { useGenerationStore } from '@/stores/generationStore'; @@ -10,12 +11,12 @@ interface SidebarProps { } const tabs = [ - { id: 'main', path: '/', icon: Volume2, label: 'Generate' }, - { id: 'stories', path: '/stories', icon: BookOpen, label: 'Stories' }, - { id: 'voices', path: '/voices', icon: Mic, label: 'Voices' }, - { id: 'audio', path: '/audio', icon: Speaker, label: 'Audio' }, - { id: 'models', path: '/models', icon: Box, label: 'Models' }, - { id: 'server', path: '/server', icon: Server, label: 'Server' }, + { id: 'main', path: '/', icon: VolumeHighIcon, label: 'Generate' }, + { id: 'stories', path: '/stories', icon: Book01Icon, label: 'Stories' }, + { id: 'voices', path: '/voices', icon: Mic01Icon, label: 'Voices' }, + { id: 'audio', path: '/audio', icon: SpeakerIcon, label: 'Audio' }, + { id: 'models', path: '/models', icon: PackageIcon, label: 'Models' }, + { id: 'server', path: '/server', icon: McpServerIcon, label: 'Server' }, ]; export function Sidebar({ isMacOS }: SidebarProps) { @@ -43,7 +44,7 @@ export function Sidebar({ isMacOS }: SidebarProps) { // For index route, use exact match; for others, use default matching const isActive = tab.path === '/' - ? matchRoute({ to: '/', exact: true }) + ? 
matchRoute({ to: '/' }) : matchRoute({ to: tab.path }); return ( @@ -58,7 +59,7 @@ export function Sidebar({ isMacOS }: SidebarProps) { title={tab.label} aria-label={tab.label} > - + ); })} @@ -75,7 +76,7 @@ export function Sidebar({ isMacOS }: SidebarProps) { isPlayerVisible ? 'mb-[120px]' : 'mb-0', )} > - + )} diff --git a/app/src/components/StoriesTab/StoryChatItem.tsx b/app/src/components/StoriesTab/StoryChatItem.tsx index 19fa2249..c1dd793f 100644 --- a/app/src/components/StoriesTab/StoryChatItem.tsx +++ b/app/src/components/StoriesTab/StoryChatItem.tsx @@ -1,6 +1,7 @@ import { useSortable } from '@dnd-kit/sortable'; import { CSS } from '@dnd-kit/utilities'; -import { GripVertical, Mic, MoreHorizontal, Play, Trash2 } from 'lucide-react'; +import { HugeiconsIcon } from '@hugeicons/react'; +import { DragDropVerticalIcon, Mic01Icon, MoreHorizontalIcon, PlayIcon, Delete01Icon } from '@hugeicons/core-free-icons'; import { useState } from 'react'; import { Button } from '@/components/ui/button'; import { @@ -74,7 +75,7 @@ export function StoryChatItem({ className="shrink-0 cursor-grab active:cursor-grabbing touch-none text-muted-foreground hover:text-foreground transition-colors" {...dragHandleProps} > - + )} @@ -92,7 +93,7 @@ export function StoryChatItem({ onError={() => setAvatarError(true)} /> ) : ( - + )} @@ -119,16 +120,16 @@ export function StoryChatItem({ - + Play from here - + Remove from Story diff --git a/app/src/components/StoriesTab/StoryContent.tsx b/app/src/components/StoriesTab/StoryContent.tsx index 483e6657..518c42a0 100644 --- a/app/src/components/StoriesTab/StoryContent.tsx +++ b/app/src/components/StoriesTab/StoryContent.tsx @@ -13,7 +13,8 @@ import { sortableKeyboardCoordinates, verticalListSortingStrategy, } from '@dnd-kit/sortable'; -import { Download, Plus } from 'lucide-react'; +import { HugeiconsIcon } from '@hugeicons/react'; +import { Download01Icon, Add01Icon } from '@hugeicons/core-free-icons'; import { useEffect, useMemo, useRef, 
useState } from 'react'; import { Button } from '@/components/ui/button'; import { Input } from '@/components/ui/input'; @@ -271,7 +272,7 @@ export function StoryContent() { @@ -316,7 +317,7 @@ export function StoryContent() { onClick={handleExportAudio} disabled={exportAudio.isPending} > - + Export Audio )} diff --git a/app/src/components/StoriesTab/StoryList.tsx b/app/src/components/StoriesTab/StoryList.tsx index ebbd6616..a283a34a 100644 --- a/app/src/components/StoriesTab/StoryList.tsx +++ b/app/src/components/StoriesTab/StoryList.tsx @@ -1,4 +1,5 @@ -import { Plus, BookOpen, MoreHorizontal, Pencil, Trash2 } from 'lucide-react'; +import { HugeiconsIcon } from '@hugeicons/react'; +import { Add01Icon, Book01Icon, MoreHorizontalIcon, PencilIcon, Delete01Icon } from '@hugeicons/core-free-icons'; import { useState } from 'react'; import { AlertDialog, @@ -177,7 +178,7 @@ export function StoryList() {

Stories

@@ -186,7 +187,7 @@ export function StoryList() {
{storyList.length === 0 ? (
- +

No stories yet

Create your first story to get started

@@ -227,19 +228,19 @@ export function StoryList() { className="h-8 w-8 opacity-0 group-hover:opacity-100 transition-opacity" onClick={(e) => e.stopPropagation()} > - + handleEditClick(story)}> - + Edit handleDeleteClick(story.id)} className="text-destructive focus:text-destructive" > - + Delete diff --git a/app/src/components/StoriesTab/StoryTrackEditor.tsx b/app/src/components/StoriesTab/StoryTrackEditor.tsx index 74dbde25..95a10296 100644 --- a/app/src/components/StoriesTab/StoryTrackEditor.tsx +++ b/app/src/components/StoriesTab/StoryTrackEditor.tsx @@ -1,14 +1,15 @@ +import { HugeiconsIcon } from '@hugeicons/react'; import { - Copy, - GripHorizontal, - Minus, - Pause, - Play, - Plus, - Scissors, - Square, - Trash2, -} from 'lucide-react'; + Copy01Icon, + DragDropHorizontalIcon, + RemoveIcon, + PauseIcon, + PlayIcon, + Add01Icon, + Scissor01Icon, + SquareIcon, + Delete01Icon, +} from '@hugeicons/core-free-icons'; import { useCallback, useEffect, useMemo, useRef, useState } from 'react'; import WaveSurfer from 'wavesurfer.js'; import { Button } from '@/components/ui/button'; @@ -723,7 +724,7 @@ export function StoryTrackEditor({ storyId, items }: StoryTrackEditorProps) { onMouseDown={handleResizeStart} aria-label="Resize track editor" > - + {/* Toolbar */} @@ -737,7 +738,7 @@ export function StoryTrackEditor({ storyId, items }: StoryTrackEditorProps) { onClick={handlePlayPause} title="Play/Pause (Space)" > - {isCurrentlyPlaying ? : } + {isCurrentlyPlaying ? : } {formatTime(currentTimeMs)} / {formatTime(totalDurationMs)} @@ -763,7 +764,7 @@ export function StoryTrackEditor({ storyId, items }: StoryTrackEditorProps) { onClick={handleSplit} title="Split at playhead (S)" > - +
)} @@ -790,10 +791,10 @@ export function StoryTrackEditor({ storyId, items }: StoryTrackEditorProps) {
Zoom:
@@ -837,7 +838,7 @@ export function StoryTrackEditor({ storyId, items }: StoryTrackEditorProps) { type="button" className="h-6 border-b bg-muted/20 sticky top-0 z-10 cursor-pointer text-left" style={{ width: `${timelineWidth}px` }} - onClick={handleTimelineClick} + onClick={(e) => handleTimelineClick(e as unknown as React.MouseEvent)} aria-label="Seek timeline" > {timeMarkers.map((ms) => ( @@ -878,7 +879,7 @@ export function StoryTrackEditor({ storyId, items }: StoryTrackEditorProps) {

@@ -122,7 +123,7 @@ export function AudioSampleRecording({ onClick={onStop} className="relative z-10 flex items-center gap-2 bg-accent text-accent-foreground hover:bg-accent/90" > - + Stop Recording

@@ -134,13 +135,13 @@ export function AudioSampleRecording({ {file && !isRecording && (

- + Recording complete

File: {file.name}

@@ -60,7 +61,7 @@ export function AudioSampleSystem({ variant="destructive" className="flex items-center gap-2" > - + Stop Capture

@@ -72,13 +73,13 @@ export function AudioSampleSystem({ {file && !isRecording && (

- + Capture complete

File: {file.name}

@@ -99,7 +100,7 @@ export function AudioSampleUpload({ ) : ( <>

- + File uploaded

File: {file.name}

@@ -111,7 +112,7 @@ export function AudioSampleUpload({ onClick={onPlayPause} disabled={isValidating} > - {isPlaying ? : } + {isPlaying ? : }
{profile.name} @@ -101,13 +102,13 @@ export function ProfileCard({ profile }: ProfileCardProps) {
} onClick={handleExport} disabled={exportProfile.isPending} aria-label="Export profile" /> } onClick={(e) => { e.stopPropagation(); handleEdit(); @@ -115,7 +116,7 @@ export function ProfileCard({ profile }: ProfileCardProps) { aria-label="Edit profile" /> } onClick={handleDeleteClick} disabled={deleteProfile.isPending} aria-label="Delete profile" diff --git a/app/src/components/VoiceProfiles/ProfileForm.tsx b/app/src/components/VoiceProfiles/ProfileForm.tsx index f4fc5711..3fcd987e 100644 --- a/app/src/components/VoiceProfiles/ProfileForm.tsx +++ b/app/src/components/VoiceProfiles/ProfileForm.tsx @@ -1,5 +1,6 @@ import { zodResolver } from '@hookform/resolvers/zod'; -import { Edit2, Mic, Monitor, Upload, X } from 'lucide-react'; +import { HugeiconsIcon } from '@hugeicons/react'; +import { Edit02Icon, Mic01Icon, DeskIcon, Upload01Icon, Cancel01Icon } from '@hugeicons/core-free-icons'; import { useEffect, useRef, useState } from 'react'; import { useForm } from 'react-hook-form'; import * as z from 'zod'; @@ -635,7 +636,7 @@ export function ProfileForm() { setSampleMode('record'); }} > - + Discard
@@ -668,16 +669,16 @@ export function ProfileForm() { className={`grid w-full ${platform.metadata.isTauri && isSystemAudioSupported ? 'grid-cols-3' : 'grid-cols-2'}`} > - + Upload - + Record {platform.metadata.isTauri && isSystemAudioSupported && ( - + System Audio )} @@ -798,7 +799,7 @@ export function ProfileForm() { className="h-full w-full object-cover" /> ) : ( - + )}
{(avatarPreview || editingProfile?.avatar_path) && ( )}
diff --git a/app/src/components/VoiceProfiles/ProfileList.tsx b/app/src/components/VoiceProfiles/ProfileList.tsx index 8dcb06a4..4e57bfdb 100644 --- a/app/src/components/VoiceProfiles/ProfileList.tsx +++ b/app/src/components/VoiceProfiles/ProfileList.tsx @@ -1,4 +1,5 @@ -import { Mic, Sparkles } from 'lucide-react'; +import { HugeiconsIcon } from '@hugeicons/react'; +import { Mic01Icon, SparklesIcon } from '@hugeicons/core-free-icons'; import { Button } from '@/components/ui/button'; import { Card, CardContent } from '@/components/ui/card'; import { useProfiles } from '@/lib/hooks/useProfiles'; @@ -30,12 +31,12 @@ export function ProfileList() { {allProfiles.length === 0 ? ( - +

No voice profiles yet. Create your first profile to get started.

diff --git a/app/src/components/VoiceProfiles/SampleList.tsx b/app/src/components/VoiceProfiles/SampleList.tsx index 19aa1ca8..17664afd 100644 --- a/app/src/components/VoiceProfiles/SampleList.tsx +++ b/app/src/components/VoiceProfiles/SampleList.tsx @@ -1,4 +1,5 @@ -import { Check, Edit, Pause, Play, Plus, Trash2, Volume2, X } from 'lucide-react'; +import { HugeiconsIcon } from '@hugeicons/react'; +import { CheckmarkCircle01Icon, Edit01Icon, PauseIcon, PlayIcon, Add01Icon, Delete01Icon, VolumeHighIcon, Cancel01Icon } from '@hugeicons/core-free-icons'; import { useEffect, useRef, useState } from 'react'; import { Button } from '@/components/ui/button'; import { CircleButton } from '@/components/ui/circle-button'; @@ -103,7 +104,7 @@ function MiniSamplePlayer({ audioUrl }: MiniSamplePlayerProps) { onClick={handlePlayPause} disabled={isLoading} > - {isPlaying ? : } + {isPlaying ? : }
@@ -129,7 +130,7 @@ function MiniSamplePlayer({ audioUrl }: MiniSamplePlayerProps) { onClick={handleStop} title="Stop" > - +
@@ -209,7 +210,7 @@ export function SampleList({ profileId }: SampleListProps) {
{samples && samples.length === 0 ? (
- +

No samples yet

Add your first audio sample to get started @@ -232,7 +233,7 @@ export function SampleList({ profileId }: SampleListProps) { /* Edit Mode */

- + Editing transcription