diff --git a/.gitignore b/.gitignore index 1b6d4c0..d6d110b 100644 --- a/.gitignore +++ b/.gitignore @@ -10,6 +10,7 @@ src-tauri/target/ src-tauri/resources/embedded/ .runtime-stage/ releases/ +assets/ src-tauri/gen/ .env .env.local diff --git a/CHANGELOG.md b/CHANGELOG.md index 816859b..ccb3fa4 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,14 @@ # Changelog +## v0.6.0 - 2026-04-19 + +- Renamed the local `compression/` package to `cache_compression/` so it no longer shadows Python 3.14's PEP 784 stdlib `compression` namespace package. Fixes a `ModuleNotFoundError: No module named 'compression._common'` surfacing on Windows with Python 3.14 when PyTorch's import chain reached into the shadowed package. +- Made the My Models library RAM estimate use the actual on-disk size + KV cache heuristic instead of the catalog flagship's `estimatedMemoryGb`, so differently-sized variants of the same family no longer all render as the same ~76 GB value. Added a parallel compressed-cache estimate for the Compressed column. +- Video diffusion models (HunyuanVideo, Mochi, Wan2.x, LTX-Video, CogVideo, etc.) are now tagged `modelType="video"` during discovery and kept out of the chat-oriented My Models list and chat picker. They continue to surface under the dedicated Video section. +- Video-gen memory safety now includes the model footprint (with device-class fragmentation factors) in the safety verdict, preventing the 40-frame Wan 2.1 T2V 1.3B MPS crash on 64 GB Macs. +- Hardened Windows staging: `scripts/stage-runtime.mjs` now clears read-only attributes and retries on transient EPERM/EBUSY during `.runtime-stage` cleanup, and skips the dev-mode tar archive that Tauri ignores anyway. `build.ps1` pre-clears stale staging and installs the project via `pip install -e ".[desktop,images]"` so strict validation has its required extras. +- Bumped the application version to `0.6.0` across the npm, Python, and Tauri package metadata. + ## v0.5.3 - 2026-04-18 - Fixed the GitHub Actions release workflow to use the valid `includeUpdaterJson` input for `tauri-apps/tauri-action@v0.6.0`, removing the repeated `uploadUpdaterJson` warnings from release builds. diff --git a/CLAUDE.md b/CLAUDE.md index b2b4799..c69fb0e 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -7,7 +7,7 @@ ChaosEngineAI is a desktop AI inference app built with: - **Desktop shell**: Tauri (Rust) — `src-tauri/` - **Backend**: Python FastAPI sidecar — `backend_service/` - **Inference engines**: MLX (Apple Silicon), llama.cpp (GGUF), vLLM (CUDA) -- **Cache strategies**: Pluggable compression via `compression/` registry +- **Cache strategies**: Pluggable compression via `cache_compression/` registry ### Key Directories @@ -19,7 +19,7 @@ ChaosEngineAI is a desktop AI inference app built with: | `backend_service/inference.py` | Core inference engine — model loading, binary routing, generation | | `backend_service/routes/` | API endpoints (14 route modules) | | `backend_service/helpers/` | System stats, settings, persistence, cache estimation | -| `compression/` | Cache strategy registry + adapters (native, rotorquant, turboquant, chaosengine, triattention) | +| `cache_compression/` | Cache strategy registry + adapters (native, rotorquant, turboquant, chaosengine, triattention). Renamed from `compression/` so it doesn't shadow Python 3.14's stdlib `compression` namespace package. 
| | `dflash/` | DFlash speculative decoding — draft model registry + availability detection | | `scripts/` | Build, install, and update scripts | | `tests/` | Python tests (pytest) | @@ -57,10 +57,11 @@ Check for updates to external repos we build from or depend on: | llama.cpp (standard) | `ggml-org/llama.cpp` | `master` | `git -C ../llama.cpp fetch && git -C ../llama.cpp log HEAD..origin/master --oneline` | | llama-server-turbo | `TheTom/llama-cpp-turboquant` | `feature/turboquant-kv-cache` | `git ls-remote https://github.com/TheTom/llama-cpp-turboquant.git refs/heads/feature/turboquant-kv-cache` | | ChaosEngine | `cryptopoly/ChaosEngine` | `main` | `git -C vendor/ChaosEngine fetch && git -C vendor/ChaosEngine log HEAD..origin/main --oneline` | -| dflash-mlx | `bstnxbt/dflash-mlx` | — | `.venv/bin/pip index versions dflash-mlx 2>/dev/null` | +| dflash-mlx | `bstnxbt/dflash-mlx` | `main` pinned to commit `f825ffb2` (upstream deleted all tags April 2026) | `git ls-remote https://github.com/bstnxbt/dflash-mlx.git refs/heads/main` | | turboquant | `back2matching/turboquant` | — | `.venv/bin/pip index versions turboquant 2>/dev/null` | -| turboquant-mlx | `sharpner/turboquant-mlx` | — | `.venv/bin/pip index versions turboquant-mlx 2>/dev/null` | +| turboquant-mlx | `arozanov/turboquant-mlx` | — | `.venv/bin/pip index versions turboquant-mlx 2>/dev/null` | | turboquant-mlx-full | `helgklaizar/turboquant_mlx` | — | `.venv/bin/pip index versions turboquant-mlx-full 2>/dev/null` | +| DDTree (ported algorithm) | `liranringel/ddtree` | `main` | `git ls-remote https://github.com/liranringel/ddtree.git HEAD` | ### 4. Cache Strategy Health - [ ] ChaosEngine `llama_cpp_cache_flags()` only emits standard types: `f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1` @@ -81,7 +82,7 @@ Check for updates to external repos we build from or depend on: | Area | Test File(s) | Command | |------|-------------|---------| -| Cache strategies (`compression/`) | `test_cache_strategies.py` | `pytest tests/test_cache_strategies.py -v` | +| Cache strategies (`cache_compression/`) | `test_cache_strategies.py` | `pytest tests/test_cache_strategies.py -v` | | DFlash / speculative decoding | `test_dflash.py` | `pytest tests/test_dflash.py -v` | | Inference / llama.cpp / binary routing | `test_inference.py` | `pytest tests/test_inference.py -v` | | Setup routes / install endpoints | `test_setup_routes.py` | `pytest tests/test_setup_routes.py -v` | diff --git a/README.md b/README.md index bde8415..7066105 100644 --- a/README.md +++ b/README.md @@ -298,7 +298,7 @@ ChaosEngineAI is three cooperating layers: - **`src-tauri/`** — Tauri 2 Rust shell + bundled runtime. - **`backend_service/`** — Python service that owns model lifecycle, the warm pool, the OpenAI-compatible API, the benchmark runner, and speculative decoding (DFlash + DDTree). - **`backend_service/routes/`** — FastAPI routes for chat, prompts, compare mode, benchmarks, plugins, images, server controls, and settings. -- **`compression/`** — Pluggable cache/compression strategy system. Ships with native f16 and optional adapters for [RotorQuant](https://github.com/scrya-com/rotorquant), [TriAttention](https://github.com/WeianMao/triattention), [TurboQuant](https://pypi.org/project/turboquant-mlx/), and [ChaosEngine](https://github.com/cryptopoly/ChaosEngine). +- **`cache_compression/`** — Pluggable cache/compression strategy system. 
Ships with native f16 and optional adapters for [RotorQuant](https://github.com/scrya-com/rotorquant), [TriAttention](https://github.com/WeianMao/triattention), [TurboQuant](https://pypi.org/project/turboquant-mlx/), and [ChaosEngine](https://github.com/cryptopoly/ChaosEngine). - **`dflash/`** — DFlash speculative decoding integration: draft model registry, fuzzy matching for quantized variants, MLX and vLLM backend detection. --- @@ -317,7 +317,7 @@ ChaosEngineAI uses a pluggable cache strategy system. Out of the box, models run Install optional backends into the backend runtime (`./.venv/bin/python3 -m pip install ...`), then restart ChaosEngineAI. TriAttention is Linux/CUDA only, the current PyPI `turboquant-mlx` package may still leave TurboQuant disabled in the current build, and ChaosEngine can now be bundled directly into desktop builds by checking out `vendor/ChaosEngine` (or setting `CHAOSENGINE_VENDOR_PATH`) before `npm run stage:runtime`. Source/dev installs can still use the local editable install from GitHub. -The system is designed so new compression methods can be added as single-file adapters in `compression/` without touching any other code. +The system is designed so new compression methods can be added as single-file adapters in `cache_compression/` without touching any other code. --- @@ -390,7 +390,7 @@ ChaosEngineAI/ ├── src-tauri/ Tauri Rust shell + bundled runtime ├── scripts/ Build, release + runtime staging scripts ├── backend_service/ Python backend (engine adapters + HTTP server) -├── compression/ Pluggable cache/compression strategy adapters +├── cache_compression/ Pluggable cache/compression strategy adapters ├── dflash/ DFlash/DDTree speculative decoding integration ├── vendor/ChaosEngine/ ChaosEngine compression (git submodule) ├── tests/ Backend integration tests diff --git a/THIRD_PARTY_NOTICES.md b/THIRD_PARTY_NOTICES.md index c8ce065..9618fb7 100644 --- a/THIRD_PARTY_NOTICES.md +++ b/THIRD_PARTY_NOTICES.md @@ -78,3 +78,17 @@ If installed by the user, each is subject to its own licence: These libraries are **not bundled** with ChaosEngineAI. They are optional pip dependencies that the user may install independently. + +--- + +## Ported Algorithms + +### DDTree (Diffusion Draft Tree) + +- **Upstream:** <https://github.com/liranringel/ddtree> +- **Licence:** MIT +- **Port location:** `backend_service/ddtree.py` +- **Usage:** The tree-building and tree-mask compilation logic is ported + to ChaosEngineAI's MLX runtime. The draft model bundle is reused from + DFlash. No upstream code is bundled verbatim; this is a re-implementation + of the published algorithm.
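+
+A minimal sketch of the tree-mask construction the port implements (illustrative
+only; the function name and list-of-lists shape are not from the upstream code):
+every node in the draft tree attends to itself and its ancestors, so the mask
+falls out of a walk up the parent pointers.
+
+```python
+def build_tree_mask(parents: list[int]) -> list[list[bool]]:
+    # parents[i] is the index of node i's parent, or -1 for the root.
+    n = len(parents)
+    mask = [[False] * n for _ in range(n)]
+    for i in range(n):
+        j = i
+        while j != -1:          # walk from node i up to the root
+            mask[i][j] = True   # node i may attend to ancestor j (and itself)
+            j = parents[j]
+    return mask
+```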
diff --git a/backend_service/app.py b/backend_service/app.py index 7d2ccd5..69aec03 100644 --- a/backend_service/app.py +++ b/backend_service/app.py @@ -18,7 +18,12 @@ ImageGenerationConfig, ImageRuntimeManager, ) -from backend_service.models import ImageGenerationRequest +from backend_service.video_runtime import ( + VideoGenerationConfig, + VideoRuntimeManager, + start_torch_warmup, +) +from backend_service.models import ImageGenerationRequest, VideoGenerationRequest from backend_service.routes import register_routes from backend_service.state import ChaosEngineState @@ -35,6 +40,12 @@ _find_image_output as _find_image_output_impl, _delete_image_output as _delete_image_output_impl, ) +from backend_service.helpers.video import ( + _load_video_outputs as _load_video_outputs_impl, + _save_video_artifact as _save_video_artifact_impl, + _find_video_output as _find_video_output_impl, + _delete_video_output as _delete_video_output_impl, +) from backend_service.helpers.settings import ( DataLocation, _default_settings as _default_settings_impl, @@ -55,9 +66,15 @@ WORKSPACE_ROOT = Path(__file__).resolve().parents[1] APP_STARTED_AT = time.time() HF_SNAPSHOT_DOWNLOAD_HELPER = ( - "import sys\n" + "import json, sys\n" "from huggingface_hub import snapshot_download\n" - "snapshot_download(repo_id=sys.argv[1], resume_download=True)\n" + "repo_id = sys.argv[1]\n" + "raw_allow = sys.argv[2] if len(sys.argv) > 2 else ''\n" + "allow_patterns = json.loads(raw_allow) if raw_allow else None\n" + "kwargs = {'repo_id': repo_id, 'resume_download': True}\n" + "if allow_patterns:\n" + " kwargs['allow_patterns'] = allow_patterns\n" + "snapshot_download(**kwargs)\n" ) DEFAULT_PORT = int(os.getenv("CHAOSENGINE_PORT", "8876")) DEFAULT_HOST = os.getenv("CHAOSENGINE_HOST", "127.0.0.1") @@ -72,6 +89,7 @@ CHAT_SESSIONS_PATH = DATA_LOCATION.chat_sessions_path DOCUMENTS_DIR = DATA_LOCATION.documents_dir IMAGE_OUTPUTS_DIR = DATA_LOCATION.image_outputs_dir +VIDEO_OUTPUTS_DIR = DATA_LOCATION.video_outputs_dir MAX_DOC_SIZE_BYTES = 50 * 1024 * 1024 # 50 MB per file MAX_SESSION_DOCS_BYTES = 200 * 1024 * 1024 # 200 MB per session DOC_ALLOWED_EXTENSIONS = { @@ -93,6 +111,7 @@ EXEMPT_AUTH_PATHS = frozenset({ "/api/health", "/api/auth/session", + "/api/system/gpu-status", }) @@ -133,20 +152,76 @@ def _save_chat_sessions(sessions: list[dict[str, Any]], path: Path = CHAT_SESSIO return _save_chat_sessions_impl(sessions, path) +def _resolve_output_dir_override(raw: str, default: Path) -> Path: + """Return the user-chosen output directory, or the default. + + Empty / whitespace-only strings restore the default. A non-empty value is + expanded (``~`` → home), resolved to an absolute path, and the directory is + created if missing. If creation fails (path is unwritable, on a missing + volume, etc.) we transparently fall back to ``default`` so generation never + crashes just because the user pointed at a stale Dropbox folder. + """ + value = (raw or "").strip() + if not value: + return default + try: + candidate = Path(os.path.expanduser(value)).resolve() + candidate.mkdir(parents=True, exist_ok=True) + return candidate + except OSError: + return default + + +def _current_image_outputs_dir() -> Path: + # The module-level ``IMAGE_OUTPUTS_DIR`` is the install-time default and + # the override target tests use to redirect output into a tempdir. Anything + # the user typed in Settings takes precedence — but only when actually set, + # so test patches still win when no setting is configured. 
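+    # Illustrative override semantics (per _resolve_output_dir_override above;
+    # the paths here are hypothetical examples):
+    #   imageOutputsDirectory=""              -> IMAGE_OUTPUTS_DIR (the default)
+    #   imageOutputsDirectory="~/Pics/CE"     -> expanded and created if missing
+    #   imageOutputsDirectory="/Volumes/gone" -> default again if mkdir raises OSError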
+ settings = _load_settings() + return _resolve_output_dir_override( + str(settings.get("imageOutputsDirectory") or ""), + IMAGE_OUTPUTS_DIR, + ) + + +def _current_video_outputs_dir() -> Path: + settings = _load_settings() + return _resolve_output_dir_override( + str(settings.get("videoOutputsDirectory") or ""), + VIDEO_OUTPUTS_DIR, + ) + + def _load_image_outputs() -> list[dict[str, Any]]: - return _load_image_outputs_impl(IMAGE_OUTPUTS_DIR) + return _load_image_outputs_impl(_current_image_outputs_dir()) def _save_image_artifact(artifact: dict[str, Any]) -> dict[str, Any]: - return _save_image_artifact_impl(artifact, IMAGE_OUTPUTS_DIR) + return _save_image_artifact_impl(artifact, _current_image_outputs_dir()) def _find_image_output(artifact_id: str) -> dict[str, Any] | None: - return _find_image_output_impl(artifact_id, IMAGE_OUTPUTS_DIR) + return _find_image_output_impl(artifact_id, _current_image_outputs_dir()) def _delete_image_output(artifact_id: str) -> bool: - return _delete_image_output_impl(artifact_id, IMAGE_OUTPUTS_DIR) + return _delete_image_output_impl(artifact_id, _current_image_outputs_dir()) + + +def _load_video_outputs() -> list[dict[str, Any]]: + return _load_video_outputs_impl(_current_video_outputs_dir()) + + +def _save_video_artifact(artifact: dict[str, Any]) -> dict[str, Any]: + return _save_video_artifact_impl(artifact, _current_video_outputs_dir()) + + +def _find_video_output(artifact_id: str) -> dict[str, Any] | None: + return _find_video_output_impl(artifact_id, _current_video_outputs_dir()) + + +def _delete_video_output(artifact_id: str) -> bool: + return _delete_video_output_impl(artifact_id, _current_video_outputs_dir()) def compute_cache_preview( @@ -196,6 +271,16 @@ def _resolve_api_token(explicit_token: str | None = None) -> str: return token or secrets.token_urlsafe(32) +def _resolve_require_api_auth(settings: dict[str, Any]) -> bool: + # Env var wins — useful for CI / headless scripts that need to drop + # the bearer requirement without touching settings.json. Accepts any + # of "0", "false", "no", "off" (case-insensitive) to disable. + env_override = os.getenv("CHAOSENGINE_REQUIRE_AUTH") + if env_override is not None: + return env_override.strip().lower() not in {"0", "false", "no", "off", ""} + return bool(settings.get("requireApiAuth", True)) + + def _is_loopback_host(host: str | None) -> bool: if not host: return False @@ -228,7 +313,7 @@ def _hf_repo_from_link(link: str | None) -> str | None: def _get_cache_strategies() -> list[dict[str, Any]]: - from compression import registry + from cache_compression import registry return registry.available() @@ -284,6 +369,74 @@ def _generate_image_artifacts( return artifacts, runtime_status +def _generate_video_artifact( + request: VideoGenerationRequest, + variant: dict[str, Any], + runtime_manager: VideoRuntimeManager, +) -> tuple[dict[str, Any], dict[str, Any]]: + """Run a single video generation and persist it to the outputs dir. + + Returns ``(artifact_dict, runtime_status_dict)``. Unlike the image path, + there is no placeholder fallback — if the runtime isn't ready or the + generation fails, the caller sees the exception and surfaces a proper + HTTP error rather than a fake clip. 
+ """ + import logging + logger = logging.getLogger("chaosengine.video") + logger.info( + "Generating video: model=%s repo=%s size=%dx%d frames=%d steps=%d", + variant.get("name"), + variant.get("repo"), + request.width, + request.height, + request.numFrames, + request.steps, + ) + + video, runtime_status = runtime_manager.generate( + VideoGenerationConfig( + modelId=request.modelId, + modelName=str(variant["name"]), + repo=str(variant["repo"]), + prompt=request.prompt, + negativePrompt=request.negativePrompt or "", + width=request.width, + height=request.height, + numFrames=request.numFrames, + fps=request.fps, + steps=request.steps, + guidance=request.guidance, + seed=request.seed, + ) + ) + + created_at = datetime.utcnow().replace(microsecond=0).isoformat() + "Z" + clip_duration = round(video.frameCount / max(1, video.fps), 3) + artifact = { + "artifactId": f"vid-{uuid.uuid4().hex[:12]}", + "modelId": request.modelId, + "modelName": variant["name"], + "prompt": request.prompt, + "negativePrompt": request.negativePrompt or "", + "width": video.width, + "height": video.height, + "numFrames": video.frameCount, + "fps": video.fps, + "steps": request.steps, + "guidance": request.guidance, + "seed": video.seed, + "createdAt": created_at, + "durationSeconds": video.durationSeconds, + "clipDurationSeconds": clip_duration, + "videoBytes": video.bytes, + "videoMimeType": video.mimeType, + "videoExtension": video.extension, + "runtimeLabel": video.runtimeLabel, + "runtimeNote": video.runtimeNote, + } + return _save_video_artifact(artifact), runtime_status + + def create_app( state: ChaosEngineState | None = None, api_token: str | None = None, @@ -300,6 +453,13 @@ def create_app( app.state.chaosengine = state or ChaosEngineState(server_port=DEFAULT_PORT) app.state.chaosengine_api_token = _resolve_api_token(api_token) app.state.chaosengine_allowed_origins = frozenset(allowed_origins) + # Bearer-token enforcement toggle. Reads from (in order) env override, + # then saved settings, defaulting to True (keep the existing secure + # default). Mutated live by state.update_settings so the user doesn't + # need to restart the server to toggle it. + app.state.chaosengine_require_api_auth = _resolve_require_api_auth( + app.state.chaosengine.settings, + ) # Shutdown hook: kill any running llama-server / MLX worker children # on backend exit. Runs on clean shutdown (uvicorn SIGTERM), Ctrl-C, @@ -360,6 +520,7 @@ async def require_api_auth(request: Request, call_next): request.method == "OPTIONS" or path in EXEMPT_AUTH_PATHS or not (path.startswith("/api/") or path.startswith("/v1/")) + or not getattr(app.state, "chaosengine_require_api_auth", True) ): return await call_next(request) @@ -399,6 +560,12 @@ async def log_requests(request, call_next): return response register_routes(app) + + # Kick off a background torch import so the first Video Studio probe + # doesn't pay the 30-60s cold-disk cost on Windows. Failures are captured + # and surfaced by probe() itself. 
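+    # (Assumed shape of start_torch_warmup, whose body isn't in this diff: a
+    # daemon thread that performs the one-off `import torch` and stashes any
+    # failure where probe() can report it, e.g.
+    #     threading.Thread(target=_warmup_torch, daemon=True).start()
+    # so create_app() returns immediately instead of blocking on the import.)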
+ start_torch_warmup() + return app diff --git a/backend_service/catalog/__init__.py b/backend_service/catalog/__init__.py index e2e4740..39e7736 100644 --- a/backend_service/catalog/__init__.py +++ b/backend_service/catalog/__init__.py @@ -1,2 +1,3 @@ from .text_models import MODEL_FAMILIES as MODEL_FAMILIES, CATALOG as CATALOG from .image_models import IMAGE_MODEL_FAMILIES as IMAGE_MODEL_FAMILIES, LATEST_IMAGE_TRACKED_SEEDS as LATEST_IMAGE_TRACKED_SEEDS +from .video_models import VIDEO_MODEL_FAMILIES as VIDEO_MODEL_FAMILIES diff --git a/backend_service/catalog/image_models.py b/backend_service/catalog/image_models.py index 6951e83..3c926e9 100644 --- a/backend_service/catalog/image_models.py +++ b/backend_service/catalog/image_models.py @@ -28,6 +28,7 @@ "recommendedResolution": "1024x1024", "note": "Fastest concepting option in the curated image catalog.", "estimatedGenerationSeconds": 4.2, + "releaseDate": "2024-08", } ], }, @@ -55,6 +56,7 @@ "recommendedResolution": "1024x1024", "note": "Quality-oriented generalist for the curated image lineup.", "estimatedGenerationSeconds": 7.4, + "releaseDate": "2024-08", } ], }, @@ -82,6 +84,7 @@ "recommendedResolution": "1024x1024", "note": "Latest Stability offering targeting a good quality-to-resource balance.", "estimatedGenerationSeconds": 5.8, + "releaseDate": "2024-10", } ], }, @@ -109,6 +112,7 @@ "recommendedResolution": "1024x1024", "note": "Fastest high-fidelity model in the curated set.", "estimatedGenerationSeconds": 3.1, + "releaseDate": "2024-10", } ], }, @@ -136,6 +140,7 @@ "recommendedResolution": "1024x1024", "note": "Widely adopted SDXL baseline with strong community and LoRA ecosystem.", "estimatedGenerationSeconds": 7.4, + "releaseDate": "2023-07", } ], }, @@ -154,6 +159,7 @@ "gated": False, "pipelineTag": "text-to-image", "updatedLabel": "Tracked latest", + "releaseDate": "2025-08", }, { "repo": "Qwen/Qwen-Image-Edit", @@ -167,6 +173,7 @@ "gated": False, "pipelineTag": "image-to-image", "updatedLabel": "Tracked latest", + "releaseDate": "2025-08", }, { "repo": "HiDream-ai/HiDream-I1-Full", @@ -180,6 +187,7 @@ "gated": False, "pipelineTag": "text-to-image", "updatedLabel": "Tracked latest", + "releaseDate": "2025-04", }, { "repo": "zai-org/GLM-Image", @@ -206,6 +214,7 @@ "gated": False, "pipelineTag": "text-to-image", "updatedLabel": "Tracked latest", + "releaseDate": "2025-03", }, { "repo": "Efficient-Large-Model/Sana_Sprint_1.6B_1024px_diffusers", @@ -219,5 +228,6 @@ "gated": False, "pipelineTag": "text-to-image", "updatedLabel": "Tracked latest", + "releaseDate": "2025-03", }, ] diff --git a/backend_service/catalog/text_models.py b/backend_service/catalog/text_models.py index 68979d1..f350348 100644 --- a/backend_service/catalog/text_models.py +++ b/backend_service/catalog/text_models.py @@ -302,6 +302,7 @@ "contextWindow": "128K", "launchMode": "direct", "backend": "llama.cpp", + "releaseDate": "2025-07", }, { "id": "mistralai/Devstral-Small-2507", @@ -317,6 +318,7 @@ "contextWindow": "128K", "launchMode": "convert", "backend": "mlx", + "releaseDate": "2025-07", }, ], "readme": [ @@ -410,6 +412,7 @@ "contextWindow": "128K", "launchMode": "convert", "backend": "mlx", + "releaseDate": "2024-09", }, { "id": "Qwen/Qwen2.5-7B-Instruct", @@ -425,6 +428,7 @@ "contextWindow": "128K", "launchMode": "convert", "backend": "mlx", + "releaseDate": "2024-09", }, { "id": "Qwen/Qwen2.5-Coder-7B-Instruct", @@ -440,6 +444,7 @@ "contextWindow": "128K", "launchMode": "convert", "backend": "mlx", + "releaseDate": "2024-11", }, { "id": 
"Qwen/Qwen2.5-32B-Instruct", @@ -455,6 +460,7 @@ "contextWindow": "128K", "launchMode": "convert", "backend": "mlx", + "releaseDate": "2024-09", }, ], "readme": [ @@ -493,6 +499,7 @@ "contextWindow": "128K", "launchMode": "convert", "backend": "mlx", + "releaseDate": "2024-12", }, { "id": "mlx-community/Llama-3.3-70B-Instruct-4bit", @@ -508,6 +515,7 @@ "contextWindow": "128K", "launchMode": "direct", "backend": "mlx", + "releaseDate": "2024-12", }, ], "readme": [ @@ -546,6 +554,7 @@ "contextWindow": "16K", "launchMode": "convert", "backend": "mlx", + "releaseDate": "2024-12", }, { "id": "mlx-community/phi-4-4bit", @@ -561,6 +570,7 @@ "contextWindow": "16K", "launchMode": "direct", "backend": "mlx", + "releaseDate": "2025-01", }, ], "readme": [ @@ -599,6 +609,7 @@ "contextWindow": "128K", "launchMode": "convert", "backend": "mlx", + "releaseDate": "2024-07", }, ], "readme": [ diff --git a/backend_service/catalog/video_models.py b/backend_service/catalog/video_models.py new file mode 100644 index 0000000..33a8e7c --- /dev/null +++ b/backend_service/catalog/video_models.py @@ -0,0 +1,257 @@ +"""Curated catalog of video generation models we plan to support. + +This module mirrors the shape of ``image_models.py`` so the frontend can reuse +the same UI patterns (families -> variants, downloads, discover tab). + +Only the first-wave candidate engines live here today. The runtime is not +wired yet — see ``backend_service/routes/video.py`` for the API surface and +``VideoPlaceholderTab`` on the frontend for the current UX. +""" + +from __future__ import annotations + +from typing import Any + + +VIDEO_MODEL_FAMILIES: list[dict[str, Any]] = [ + { + "id": "ltx-video", + "name": "LTX-Video", + "provider": "Lightricks", + "headline": "Fast text-to-video model tuned for consumer hardware.", + "summary": "First target for local video generation. Short clips (2-5s) at 768x512 with solid motion quality.", + "updatedLabel": "Planned — first wave", + "badges": ["Fast", "Small", "Apache 2.0"], + "defaultVariantId": "Lightricks/LTX-Video", + "variants": [ + { + "id": "Lightricks/LTX-Video", + "familyId": "ltx-video", + "name": "LTX-Video", + "provider": "Lightricks", + "repo": "Lightricks/LTX-Video", + "link": "https://huggingface.co/Lightricks/LTX-Video", + "runtime": "diffusers LTXPipeline (planned)", + "styleTags": ["general", "fast", "motion"], + "taskSupport": ["txt2video"], + "sizeGb": 2.0, + "recommendedResolution": "768x512", + "defaultDurationSeconds": 4.0, + "note": "Small, fast, Apache 2.0 — best starter pick for a local video runtime.", + "estimatedGenerationSeconds": 45.0, + "availableLocally": False, + "releaseDate": "2024-11", + } + ], + }, + { + "id": "wan-2-1", + "name": "Wan 2.1", + "provider": "Alibaba", + "headline": "Smaller Wan variants — the 1.3B is the fastest starter pick for local video.", + "summary": "Wan 2.1 ships in a 1.3B size that fits on modest hardware and a 14B size for higher quality. Both use the same WanPipeline in diffusers.", + "updatedLabel": "Planned — first wave", + "badges": ["Small", "Fast", "Apache 2.0"], + "defaultVariantId": "Wan-AI/Wan2.1-T2V-1.3B-Diffusers", + "variants": [ + { + "id": "Wan-AI/Wan2.1-T2V-1.3B-Diffusers", + "familyId": "wan-2-1", + "name": "Wan 2.1 T2V 1.3B", + "provider": "Alibaba", + # The -Diffusers mirror ships the standard diffusers layout + # (model_index.json, scheduler/, text_encoder/, transformer/, + # vae/, tokenizer/) — the base Wan-AI repo uses a native Wan + # format that WanPipeline.from_pretrained can't load. 
+ "repo": "Wan-AI/Wan2.1-T2V-1.3B-Diffusers", + "link": "https://huggingface.co/Wan-AI/Wan2.1-T2V-1.3B-Diffusers", + "runtime": "diffusers WanPipeline", + "styleTags": ["general", "fast", "small"], + "taskSupport": ["txt2video"], + # ~16GB on disk — 1.3B is just the transformer. The repo also + # ships a UMT5-XXL text encoder (~11GB) and VAE/CLIP weights. + "sizeGb": 16.4, + "recommendedResolution": "832x480", + "defaultDurationSeconds": 4.0, + "note": "1.3B transformer + UMT5 text encoder. ~16GB on disk. Best starter pick for trying local video end-to-end on modest hardware.", + "estimatedGenerationSeconds": 60.0, + "availableLocally": False, + "releaseDate": "2025-02", + }, + { + "id": "Wan-AI/Wan2.1-T2V-14B-Diffusers", + "familyId": "wan-2-1", + "name": "Wan 2.1 T2V 14B", + "provider": "Alibaba", + "repo": "Wan-AI/Wan2.1-T2V-14B-Diffusers", + "link": "https://huggingface.co/Wan-AI/Wan2.1-T2V-14B-Diffusers", + "runtime": "diffusers WanPipeline", + "styleTags": ["general", "quality", "motion"], + "taskSupport": ["txt2video"], + # 14B transformer in bf16 (~28GB) + UMT5-XXL text encoder (~11GB) + # + VAE/CLIP weights. + "sizeGb": 45.0, + "recommendedResolution": "832x480", + "defaultDurationSeconds": 5.0, + "note": "Wan 2.1 quality tier. ~45GB. Same WanPipeline class as the 1.3B and Wan 2.2.", + "estimatedGenerationSeconds": 180.0, + "availableLocally": False, + "releaseDate": "2025-02", + }, + ], + }, + { + "id": "wan-2-2", + "name": "Wan 2.2", + "provider": "Alibaba", + "headline": "Strong text-to-video quality with competitive motion consistency.", + "summary": "Mid-sized Wan model that runs on 24GB+ VRAM or Apple Silicon with unified memory.", + "updatedLabel": "Planned — first wave", + "badges": ["Balanced", "Quality", "Apache 2.0"], + "defaultVariantId": "Wan-AI/Wan2.2-T2V-A14B-Diffusers", + "variants": [ + { + "id": "Wan-AI/Wan2.2-T2V-A14B-Diffusers", + "familyId": "wan-2-2", + "name": "Wan 2.2 T2V A14B", + "provider": "Alibaba", + # -Diffusers mirror ships the standard diffusers layout; the + # base Wan-AI/Wan2.2-T2V-A14B repo uses the native Wan format. + "repo": "Wan-AI/Wan2.2-T2V-A14B-Diffusers", + "link": "https://huggingface.co/Wan-AI/Wan2.2-T2V-A14B-Diffusers", + "runtime": "diffusers WanPipeline", + "styleTags": ["general", "quality", "motion"], + "taskSupport": ["txt2video"], + "sizeGb": 14.0, + "recommendedResolution": "832x480", + "defaultDurationSeconds": 5.0, + "note": "Balanced quality vs size. Works on 24GB VRAM or 64GB unified memory.", + "estimatedGenerationSeconds": 180.0, + "availableLocally": False, + "releaseDate": "2025-07", + } + ], + }, + { + "id": "hunyuan-video", + "name": "HunyuanVideo", + "provider": "Tencent", + "headline": "High-fidelity text-to-video with longer clips and stronger scene cohesion.", + "summary": "Heavy-duty model that needs 40GB+ class hardware. Ships longer clips and nicer compositions.", + "updatedLabel": "Planned — stretch target", + "badges": ["Quality", "Heavy", "Apache 2.0"], + "defaultVariantId": "hunyuanvideo-community/HunyuanVideo", + "variants": [ + { + "id": "hunyuanvideo-community/HunyuanVideo", + "familyId": "hunyuan-video", + "name": "HunyuanVideo", + "provider": "Tencent", + # Community-maintained diffusers port of tencent/HunyuanVideo. + # The base tencent repo doesn't ship model_index.json — the + # -community mirror is the one HunyuanVideoPipeline loads. 
+ "repo": "hunyuanvideo-community/HunyuanVideo", + "link": "https://huggingface.co/hunyuanvideo-community/HunyuanVideo", + "runtime": "diffusers HunyuanVideoPipeline", + "styleTags": ["general", "quality", "cinematic"], + "taskSupport": ["txt2video"], + "sizeGb": 25.0, + "recommendedResolution": "1280x720", + "defaultDurationSeconds": 5.0, + "note": "High quality. Needs 40GB+ VRAM or Apple Silicon Max/Ultra class memory.", + "estimatedGenerationSeconds": 420.0, + "availableLocally": False, + "releaseDate": "2024-12", + } + ], + }, + { + "id": "mochi-1", + "name": "Mochi 1", + "provider": "Genmo", + "headline": "Open-weight video model with competitive motion quality.", + "summary": "Apache 2.0 licence, solid motion handling, mid-sized footprint.", + "updatedLabel": "Planned — first wave", + "badges": ["Open", "Balanced", "Apache 2.0"], + "defaultVariantId": "genmo/mochi-1-preview", + "variants": [ + { + "id": "genmo/mochi-1-preview", + "familyId": "mochi-1", + "name": "Mochi 1 Preview", + "provider": "Genmo", + "repo": "genmo/mochi-1-preview", + "link": "https://huggingface.co/genmo/mochi-1-preview", + "runtime": "diffusers MochiPipeline (planned)", + "styleTags": ["general", "motion", "balanced"], + "taskSupport": ["txt2video"], + "sizeGb": 10.0, + "recommendedResolution": "848x480", + "defaultDurationSeconds": 5.4, + "note": "Apache 2.0, balanced footprint, strong motion quality.", + "estimatedGenerationSeconds": 150.0, + "availableLocally": False, + "releaseDate": "2024-10", + } + ], + }, + { + "id": "cogvideox", + "name": "CogVideoX", + "provider": "THUDM", + "headline": "Tsinghua's open-weight video model — 2B fits 8 GB VRAM, 5B is the quality tier.", + "summary": ( + "CogVideoX ships in a 2B size that runs on 8 GB consumer GPUs and a 5B size that " + "delivers higher fidelity on 24 GB+ cards or unified-memory Macs. Both use the same " + "CogVideoXPipeline in diffusers." + ), + "updatedLabel": "Planned — first wave", + "badges": ["Small", "Open", "Apache 2.0"], + "defaultVariantId": "THUDM/CogVideoX-2b", + "variants": [ + { + "id": "THUDM/CogVideoX-2b", + "familyId": "cogvideox", + "name": "CogVideoX 2B", + "provider": "THUDM", + "repo": "THUDM/CogVideoX-2b", + "link": "https://huggingface.co/THUDM/CogVideoX-2b", + "runtime": "diffusers CogVideoXPipeline", + "styleTags": ["general", "fast", "small"], + "taskSupport": ["txt2video"], + # 2B transformer in fp16 (~4 GB) + T5 text encoder (~5 GB) + + # VAE. Fits comfortably on a 12 GB card; 8 GB works with + # CPU-offload tricks. Smaller than Wan 2.1 1.3B because there's + # no UMT5-XXL — just the standard T5. + "sizeGb": 9.0, + "recommendedResolution": "720x480", + "defaultDurationSeconds": 6.0, + "note": "Smallest CogVideoX. Apache 2.0 weights, ~9 GB on disk, runs on consumer GPUs.", + "estimatedGenerationSeconds": 90.0, + "availableLocally": False, + "releaseDate": "2024-08", + }, + { + "id": "THUDM/CogVideoX-5b", + "familyId": "cogvideox", + "name": "CogVideoX 5B", + "provider": "THUDM", + "repo": "THUDM/CogVideoX-5b", + "link": "https://huggingface.co/THUDM/CogVideoX-5b", + "runtime": "diffusers CogVideoXPipeline", + "styleTags": ["general", "quality", "balanced"], + "taskSupport": ["txt2video"], + # 5B transformer (~10 GB) + T5 (~5 GB) + VAE. Lands in the + # same envelope as Wan 2.2 — needs 24 GB VRAM or 32 GB+ + # unified memory. + "sizeGb": 18.0, + "recommendedResolution": "720x480", + "defaultDurationSeconds": 6.0, + "note": "Quality tier. ~18 GB on disk. 
Same CogVideoXPipeline class as the 2B.", + "estimatedGenerationSeconds": 200.0, + "availableLocally": False, + "releaseDate": "2024-08", + }, + ], + }, +] diff --git a/backend_service/helpers/cache.py b/backend_service/helpers/cache.py index 7a3e75e..cc3069e 100644 --- a/backend_service/helpers/cache.py +++ b/backend_service/helpers/cache.py @@ -47,7 +47,7 @@ def compute_cache_preview( strategy: str = "native", build_system_snapshot=None, ) -> dict[str, Any]: - from compression import registry as _cache_registry + from cache_compression import registry as _cache_registry num_layers = max(1, num_layers) num_heads = max(1, num_heads) diff --git a/backend_service/helpers/discovery.py b/backend_service/helpers/discovery.py index 55a6a63..6847d96 100644 --- a/backend_service/helpers/discovery.py +++ b/backend_service/helpers/discovery.py @@ -329,6 +329,27 @@ def _detect_model_quantization(path: Path, fmt: str, *, name_hint: str = "") -> ) +# Video diffusion pipelines. Keep keywords specific enough that they don't +# collide with chat LLMs or image diffusion checkpoints — e.g. "hunyuanvideo" +# not "hunyuan" (which would catch the Hunyuan image model), "wan2" not "wan" +# (too generic), "mochi-1" not "mochi". New video families added to +# ``backend_service/catalog/video_models.py`` should also get a keyword here. +_VIDEO_MODEL_KEYWORDS = ( + "hunyuanvideo", + "wan-ai/", + "wan2.", + "wan2-", + "-t2v-", + "-i2v-", + "-v2v-", + "mochi-1", + "cogvideo", + "ltx-video", + "zeroscope", + "animatediff", +) + + def _looks_like_draft_model(name: str) -> bool: """Return True if this looks like a speculative decoding draft model. @@ -339,6 +360,24 @@ def _looks_like_draft_model(name: str) -> bool: return any(kw in lower for kw in _DRAFT_MODEL_KEYWORDS) +def _looks_like_video_model(name: str) -> bool: + """Return True if this looks like a video diffusion pipeline. + + Video models (LTX-Video, Wan, HunyuanVideo, Mochi, CogVideo, …) are + Diffusers pipelines with much larger VRAM footprints than LLMs and + their own dedicated Studio/Discover UI under the Video section. They + should be excluded from the chat-oriented My Models list. + + Detection is keyword-only here because video Diffusers pipelines share + the ``model_index.json`` marker with image pipelines — we can't use that + to discriminate. When a partial HF cache download hasn't yet produced + ``model_index.json``, the name-based match is what keeps them out of + the LLM list. 
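+
+    Illustrative matches against ``_VIDEO_MODEL_KEYWORDS``::
+
+        _looks_like_video_model("Wan2.1-T2V-1.3B-Diffusers")  # True ("wan2." + "-t2v-")
+        _looks_like_video_model("HunyuanVideo")               # True ("hunyuanvideo")
+        _looks_like_video_model("Qwen2.5-7B-Instruct")        # False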
+ """ + lower = name.lower() + return any(kw in lower for kw in _VIDEO_MODEL_KEYWORDS) + + def _looks_like_image_model(path: Path, name: str) -> bool: """Return True if this looks like a diffusion / image generation model.""" lower_name = name.lower() @@ -623,7 +662,9 @@ def _discover_local_models(model_directories: list[dict[str, Any]], limit: int = broken, broken_reason = _detect_broken_library_item(child, file_format, source_kind) quantization = _detect_model_quantization(child, file_format, name_hint=name) backend = "llama.cpp" if file_format == "GGUF" else "mlx" - if _looks_like_image_model(child, name): + if _looks_like_video_model(name): + model_type = "video" + elif _looks_like_image_model(child, name): model_type = "image" elif _looks_like_draft_model(name): model_type = "draft" diff --git a/backend_service/helpers/formatting.py b/backend_service/helpers/formatting.py index 71a11da..d37b60c 100644 --- a/backend_service/helpers/formatting.py +++ b/backend_service/helpers/formatting.py @@ -51,7 +51,7 @@ def _parse_context_label(label: str | None) -> int | None: def _benchmark_label(model_name: str, *, cache_strategy: str, bits: int, fp16_layers: int, context_tokens: int) -> str: - from compression import registry as _strategy_registry + from cache_compression import registry as _strategy_registry strat = _strategy_registry.get(cache_strategy) or _strategy_registry.default() cache_label = strat.label(bits, fp16_layers) return f"{model_name} / {cache_label} / {_context_label(context_tokens)} ctx" diff --git a/backend_service/helpers/gpu.py b/backend_service/helpers/gpu.py index 90f933b..2c4e84a 100644 --- a/backend_service/helpers/gpu.py +++ b/backend_service/helpers/gpu.py @@ -8,11 +8,23 @@ from __future__ import annotations import platform +import shutil import subprocess import json +import threading from typing import Any +# Windows: prevent every nvidia-smi / sysctl invocation from flashing a +# console window. Without this, FastAPI worker threads on Windows pop a +# brief cmd.exe window per probe — and on slower disks the spawn alone +# can add 1-2s of latency to ``/api/video/runtime``, blowing past the +# frontend's 15s fetch timeout and surfacing as "Failed to fetch". 
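+# Every probe below splats these kwargs into its subprocess call, e.g.
+#   subprocess.check_output(["sysctl", "-n", "hw.memsize"], text=True,
+#                           timeout=5, **_SUBPROCESS_KWARGS)
+# On non-Windows platforms the dict stays empty, so behaviour is unchanged.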
+_SUBPROCESS_KWARGS: dict[str, Any] = {} +if hasattr(subprocess, "CREATE_NO_WINDOW"): + _SUBPROCESS_KWARGS["creationflags"] = subprocess.CREATE_NO_WINDOW + + class GPUMonitor: """Cross-platform GPU/accelerator monitor.""" @@ -41,6 +53,7 @@ def _snapshot_macos(self) -> dict[str, Any]: chip = subprocess.check_output( ["sysctl", "-n", "machdep.cpu.brand_string"], text=True, timeout=5, + **_SUBPROCESS_KWARGS, ).strip() if chip: gpu_name = chip @@ -53,6 +66,7 @@ def _snapshot_macos(self) -> dict[str, Any]: total_bytes = int(subprocess.check_output( ["sysctl", "-n", "hw.memsize"], text=True, timeout=5, + **_SUBPROCESS_KWARGS, ).strip()) vram_total_gb = round(total_bytes / (1024 ** 3), 2) except Exception: @@ -72,6 +86,7 @@ def _snapshot_macos(self) -> dict[str, Any]: out = subprocess.check_output( ["ioreg", "-r", "-d", "1", "-c", "AppleARMIODevice"], text=True, timeout=5, + **_SUBPROCESS_KWARGS, ) # Best-effort — ioreg doesn't reliably expose GPU util on all chips except Exception: @@ -100,6 +115,7 @@ def _snapshot_nvidia(self) -> dict[str, Any]: ], text=True, timeout=10, + **_SUBPROCESS_KWARGS, ) parts = [p.strip() for p in out.strip().split(",")] if len(parts) >= 6: @@ -151,3 +167,119 @@ def _fallback_psutil(self) -> dict[str, Any]: def get_gpu_metrics() -> dict[str, Any]: """Return a snapshot of current GPU / accelerator metrics.""" return _monitor.snapshot() + + +# VRAM total never changes for the life of a process — caching it lets the +# video runtime probe stay snappy even when nvidia-smi takes a second or two +# to spawn on Windows. Cleared by ``reset_vram_total_cache()`` for tests. +_VRAM_TOTAL_LOCK = threading.Lock() +_VRAM_TOTAL_CACHE: dict[str, float | None] = {} + + +def get_device_vram_total_gb() -> float | None: + """Return total device memory in GB, cached for the process lifetime. + + Hot path for ``backend_service.video_runtime._detect_device_memory_gb``. + The full ``snapshot()`` call shells out to ``nvidia-smi``/``sysctl`` every + time, which is fine for the metrics endpoint (live readings) but wasteful + for the video runtime probe (which only needs total VRAM, and a value + that is fixed per machine). On Windows the subprocess startup cost was + blowing past the frontend's 15s fetch timeout under load. + """ + with _VRAM_TOTAL_LOCK: + if "value" in _VRAM_TOTAL_CACHE: + return _VRAM_TOTAL_CACHE["value"] + + try: + snapshot = _monitor.snapshot() + except Exception: + snapshot = {} + + total = snapshot.get("vram_total_gb") + value: float | None = float(total) if isinstance(total, (int, float)) and total > 0 else None + + with _VRAM_TOTAL_LOCK: + _VRAM_TOTAL_CACHE["value"] = value + return value + + +def reset_vram_total_cache() -> None: + """Clear the cached VRAM total. Used by tests.""" + with _VRAM_TOTAL_LOCK: + _VRAM_TOTAL_CACHE.clear() + + +def nvidia_gpu_present() -> bool: + """Cheap, side-effect-free check for an NVIDIA GPU on Linux/Windows. + + We only look for ``nvidia-smi`` on ``PATH`` — invoking it is deliberately + avoided because some locked-down laptops and WSL installs without the + driver shim hang on the first call. Presence on ``PATH`` is a + reliable-enough signal for the "you probably wanted CUDA" diagnostic the + image/video runtimes surface when torch falls back to CPU. 
+ """ + return shutil.which("nvidia-smi") is not None + + +_CUDA_WHEEL_HINT = ( + "Click \"Install CUDA torch\" in this banner, or run: " + "pip install --upgrade --force-reinstall torch " + "--index-url https://download.pytorch.org/whl/cu124" +) + + +def gpu_status_snapshot() -> dict[str, Any]: + """Unified GPU status for the frontend warning banner. + + Returns a dict with the host platform, whether an NVIDIA driver is + visible, whether torch can reach CUDA / MPS, and a recommendation string + when torch falls back to CPU on a machine with an NVIDIA GPU. All fields + are optional so this can be called before torch has been imported without + failing. + """ + system = platform.system() + nvidia_present = nvidia_gpu_present() + + torch_imported = False + cuda_available = False + mps_available = False + try: + import torch # type: ignore + except Exception: + torch_module = None + else: + torch_module = torch + torch_imported = True + + if torch_module is not None: + try: + cuda_available = bool(getattr(torch_module.cuda, "is_available", lambda: False)()) + except Exception: + cuda_available = False + try: + mps_module = getattr(torch_module.backends, "mps", None) + if mps_module is not None: + mps_available = bool(getattr(mps_module, "is_available", lambda: False)()) + except Exception: + mps_available = False + + if system in ("Windows", "Linux") and nvidia_present and torch_imported and not cuda_available: + recommendation = ( + "torch was imported but CUDA is unavailable — generation will run on CPU " + "(expect minutes per step). Reinstall the CUDA wheel: " + + _CUDA_WHEEL_HINT + ) + warn = True + else: + recommendation = None + warn = False + + return { + "platform": system, + "nvidiaGpuDetected": nvidia_present, + "torchImported": torch_imported, + "torchCudaAvailable": cuda_available, + "torchMpsAvailable": mps_available, + "cpuFallbackWarning": warn, + "recommendation": recommendation, + } diff --git a/backend_service/helpers/huggingface.py b/backend_service/helpers/huggingface.py index d22436a..371424f 100644 --- a/backend_service/helpers/huggingface.py +++ b/backend_service/helpers/huggingface.py @@ -11,7 +11,7 @@ from pathlib import Path from typing import Any -from backend_service.catalog import MODEL_FAMILIES, IMAGE_MODEL_FAMILIES +from backend_service.catalog import MODEL_FAMILIES, IMAGE_MODEL_FAMILIES, VIDEO_MODEL_FAMILIES from backend_service.helpers.formatting import _bytes_to_gb from backend_service.helpers.discovery import _path_size_bytes @@ -161,6 +161,7 @@ def _search_huggingface_hub(query: str, library: list[dict[str, Any]], limit: in downloads = model.get("downloads") or 0 likes = model.get("likes") or 0 last_modified = str(model.get("lastModified") or "").strip() or None + created_at = str(model.get("createdAt") or "").strip() or None results.append({ "id": model_id, @@ -176,6 +177,8 @@ def _search_huggingface_hub(query: str, library: list[dict[str, Any]], limit: in "likesLabel": f"{likes:,} likes", "lastModified": last_modified, "updatedLabel": _format_hf_updated_label(last_modified), + "createdAt": created_at, + "releaseLabel": _format_release_label(created_at), "availableLocally": available_locally, "launchMode": launch_mode, "backend": backend, @@ -416,6 +419,32 @@ def _format_hf_updated_label(value: str | None) -> str | None: return f"Updated {month_label} {parsed.day}, {parsed.year}" +def _format_release_label(value: str | None) -> str | None: + """Format a release date / HF ``createdAt`` into a short human label. 
+ + Accepts either a full ISO datetime (``2024-08-01T12:34:56Z`` — HF API) + or a year-month shorthand (``2024-08`` — curated catalog entries) and + returns ``"Released Aug 2024"``. Falls back to None when the input + can't be parsed. + """ + if not value: + return None + parsed = _parse_iso_datetime(value) + if parsed is None: + # Try ``YYYY-MM`` or ``YYYY-MM-DD`` shorthand used in curated catalog + # entries — ``_parse_iso_datetime`` only handles the full datetime form. + text = str(value).strip() + for fmt in ("%Y-%m-%d", "%Y-%m", "%Y"): + try: + parsed = datetime.strptime(text, fmt).replace(tzinfo=timezone.utc) + break + except ValueError: + continue + if parsed is None: + return None + return f"Released {parsed.strftime('%b')} {parsed.year}" + + def _hf_number_label(value: int, noun: str) -> str: return f"{value:,} {noun}" @@ -508,6 +537,17 @@ def _known_repo_size_gb(repo_id: str) -> float | None: if size_gb > 0: return size_gb + for family in VIDEO_MODEL_FAMILIES: + for variant in family["variants"]: + if str(variant.get("repo") or "") != repo_id: + continue + try: + size_gb = float(variant.get("sizeGb") or 0) + except (TypeError, ValueError): + size_gb = 0.0 + if size_gb > 0: + return size_gb + return None diff --git a/backend_service/helpers/images.py b/backend_service/helpers/images.py index 73b8d9d..169be5c 100644 --- a/backend_service/helpers/images.py +++ b/backend_service/helpers/images.py @@ -18,11 +18,12 @@ from backend_service.helpers.huggingface import ( _classify_hub_file, _format_hf_updated_label, + _format_release_label, _hf_number_label, _hf_repo_snapshot_dir, _parse_iso_datetime, ) -from backend_service.helpers.discovery import _candidate_model_dirs +from backend_service.helpers.discovery import _candidate_model_dirs, _path_size_bytes from backend_service.image_runtime import validate_local_diffusers_snapshot @@ -31,6 +32,33 @@ _LATEST_IMAGE_MODELS_CACHE: tuple[float, list[dict[str, Any]]] | None = None _LATEST_IMAGE_MODELS_TTL_SECONDS = 3 * 60 * 60 +# Cache keyed by (path, mtime_ns) — we recompute only when the snapshot dir +# actually changes. A fresh os.stat() is cheap enough to do per payload call. +_SNAPSHOT_SIZE_CACHE: dict[tuple[str, int], int] = {} + + +def _snapshot_on_disk_bytes(snapshot_dir: Path | None) -> int | None: + """Walk the HF snapshot dir and return its true on-disk byte size. + + Delegates to ``_path_size_bytes`` which dedupes by inode, so HF's + ``snapshots/<hash>/ -> blobs/`` symlink farm counts each blob + exactly once. Returns ``None`` when the path is missing or empty so + callers can distinguish "not on disk" from "zero bytes".
+ """ + if snapshot_dir is None: + return None + try: + stat_result = snapshot_dir.stat() + except OSError: + return None + cache_key = (str(snapshot_dir), stat_result.st_mtime_ns) + cached = _SNAPSHOT_SIZE_CACHE.get(cache_key) + if cached is not None: + return cached or None + total = _path_size_bytes(snapshot_dir) + _SNAPSHOT_SIZE_CACHE[cache_key] = total + return total or None + def _stable_image_hash(value: str) -> int: acc = 0 @@ -94,17 +122,30 @@ def _image_model_payloads(library: list[dict[str, Any]]) -> list[dict[str, Any]] families: list[dict[str, Any]] = [] for family in IMAGE_MODEL_FAMILIES: - variants = [ - { - **variant, - **repo_metadata.get(str(variant.get("repo") or ""), {}), - "source": "curated", - "familyName": family.get("name"), - "availableLocally": _image_variant_available_locally(variant, library), - "hasLocalData": _hf_repo_snapshot_dir(str(variant.get("repo") or "")) is not None, - } - for variant in family["variants"] - ] + variants = [] + for variant in family["variants"]: + repo_id = str(variant.get("repo") or "") + snapshot_dir = _hf_repo_snapshot_dir(repo_id) if repo_id else None + live_metadata = repo_metadata.get(repo_id, {}) + curated_release_date = str(variant.get("releaseDate") or "").strip() or None + curated_release_label = _format_release_label(curated_release_date) + release_label = curated_release_label or live_metadata.get("releaseLabel") + on_disk_bytes = _snapshot_on_disk_bytes(snapshot_dir) + variants.append( + { + **variant, + **live_metadata, + "source": "curated", + "familyName": family.get("name"), + "availableLocally": _image_variant_available_locally(variant, library), + "hasLocalData": snapshot_dir is not None, + "localPath": str(snapshot_dir) if snapshot_dir else None, + "releaseDate": curated_release_date, + "releaseLabel": release_label, + "onDiskBytes": on_disk_bytes, + "onDiskGb": _bytes_to_gb(on_disk_bytes) if on_disk_bytes else None, + } + ) families.append( { **family, @@ -202,6 +243,7 @@ def _image_repo_live_metadata(repo_id: str) -> dict[str, Any]: downloads = int(data.get("downloads") or 0) likes = int(data.get("likes") or 0) last_modified = str(data.get("lastModified") or "").strip() or None + created_at = str(data.get("createdAt") or "").strip() or None payload = { "downloads": downloads, "likes": likes, @@ -209,6 +251,8 @@ def _image_repo_live_metadata(repo_id: str) -> dict[str, Any]: "likesLabel": _hf_number_label(likes, "likes") if likes > 0 else None, "lastModified": last_modified, "updatedLabel": _format_hf_updated_label(last_modified), + "createdAt": created_at, + "releaseLabel": _format_release_label(created_at), "license": license_value, "gated": bool(data.get("gated")), "pipelineTag": str(data.get("pipeline_tag") or "").strip() or None, @@ -318,6 +362,9 @@ def _tracked_latest_seed_payloads(library: list[dict[str, Any]]) -> list[dict[st repo_id = str(seed.get("repo") or "") if not repo_id: continue + release_date = str(seed.get("releaseDate") or "").strip() or None + snapshot_dir = _hf_repo_snapshot_dir(repo_id) + on_disk_bytes = _snapshot_on_disk_bytes(snapshot_dir) payloads.append( { "id": repo_id, @@ -337,7 +384,10 @@ def _tracked_latest_seed_payloads(library: list[dict[str, Any]]) -> list[dict[st or "Tracked latest image repo surfaced by ChaosEngineAI when the live latest lane is sparse." 
), "availableLocally": _image_repo_runtime_ready(repo_id), - "hasLocalData": _hf_repo_snapshot_dir(repo_id) is not None, + "hasLocalData": snapshot_dir is not None, + "localPath": str(snapshot_dir) if snapshot_dir else None, + "onDiskBytes": on_disk_bytes, + "onDiskGb": _bytes_to_gb(on_disk_bytes) if on_disk_bytes else None, "estimatedGenerationSeconds": None, "downloads": None, "likes": None, @@ -345,6 +395,9 @@ def _tracked_latest_seed_payloads(library: list[dict[str, Any]]) -> list[dict[st "likesLabel": None, "lastModified": None, "updatedLabel": str(seed.get("updatedLabel") or "Tracked latest"), + "createdAt": None, + "releaseDate": release_date, + "releaseLabel": _format_release_label(release_date), "license": seed.get("license"), "gated": seed.get("gated"), "pipelineTag": seed.get("pipelineTag"), @@ -458,6 +511,8 @@ def _latest_image_model_payloads(library: list[dict[str, Any]], limit: int = 10) tags = [str(tag) for tag in (model.get("tags") or [])] pipeline_tag = str(model.get("pipeline_tag") or "").strip() or None metadata = _image_repo_live_metadata(model_id) + snapshot_dir = _hf_repo_snapshot_dir(model_id) + on_disk_bytes = _snapshot_on_disk_bytes(snapshot_dir) candidates.append({ "id": model_id, "familyId": "latest", @@ -476,7 +531,10 @@ def _latest_image_model_payloads(library: list[dict[str, Any]], limit: int = 10) "Review details on Hugging Face before treating it as a fully curated Studio default." ), "availableLocally": _image_repo_runtime_ready(model_id), - "hasLocalData": _hf_repo_snapshot_dir(model_id) is not None, + "hasLocalData": snapshot_dir is not None, + "localPath": str(snapshot_dir) if snapshot_dir else None, + "onDiskBytes": on_disk_bytes, + "onDiskGb": _bytes_to_gb(on_disk_bytes) if on_disk_bytes else None, "estimatedGenerationSeconds": None, "downloads": metadata.get("downloads"), "likes": metadata.get("likes"), @@ -484,6 +542,8 @@ def _latest_image_model_payloads(library: list[dict[str, Any]], limit: int = 10) "likesLabel": metadata.get("likesLabel"), "lastModified": metadata.get("lastModified"), "updatedLabel": metadata.get("updatedLabel"), + "createdAt": metadata.get("createdAt"), + "releaseLabel": metadata.get("releaseLabel"), "license": metadata.get("license"), "gated": bool(metadata.get("gated")) if metadata.get("gated") is not None else None, "pipelineTag": metadata.get("pipelineTag") or pipeline_tag, @@ -608,6 +668,50 @@ def _image_download_repo_ids() -> set[str]: return repos +# Diffusers image pipelines (FLUX, SD3.5, SDXL, Sana, HiDream, Qwen-Image, ...) +# always load from the per-component folder layout at the snapshot root. Many +# repos also ship a legacy single-file checkpoint (e.g. ``flux1-schnell.safetensors`` +# in ``black-forest-labs/FLUX.1-schnell``) for ComfyUI/kijai users — ~24 GB of +# duplicate weights the diffusers pipeline never touches. Without an allowlist +# ``snapshot_download`` pulls both copies, so a 23 GB model lands on disk as +# 57+ GB. Mirrors ``_VIDEO_DIFFUSERS_ALLOW_PATTERNS`` in ``helpers/video.py``. +_IMAGE_DIFFUSERS_ALLOW_PATTERNS: list[str] = [ + "model_index.json", + "scheduler/**", + "text_encoder/**", + "text_encoder_2/**", + "text_encoder_3/**", + "tokenizer/**", + "tokenizer_2/**", + "tokenizer_3/**", + "transformer/**", + "transformer_2/**", + "unet/**", + "vae/**", + "feature_extractor/**", + "image_encoder/**", + "safety_checker/**", + "*.md", + "LICENSE*", +] + + +def _image_repo_allow_patterns(repo_id: str) -> list[str] | None: + """Patterns to pass to ``snapshot_download`` for an image repo. 
+ + Returns ``None`` for repos that aren't known curated or tracked image + models so arbitrary Discover hub results still download in full. Returning + ``None`` (not an empty list) signals the caller to omit ``allow_patterns`` + entirely — an empty list would match nothing and download zero files. + """ + if not repo_id: + return None + known = _image_download_repo_ids() + if repo_id not in known: + return None + return list(_IMAGE_DIFFUSERS_ALLOW_PATTERNS) + + # ---- Image output CRUD ---- def _image_output_directory(image_outputs_dir: Path, created_at: str | None = None) -> Path: diff --git a/backend_service/helpers/settings.py b/backend_service/helpers/settings.py index 23d6c06..b71b087 100644 --- a/backend_service/helpers/settings.py +++ b/backend_service/helpers/settings.py @@ -181,6 +181,14 @@ def images_dir(self) -> Path: def image_outputs_dir(self) -> Path: return self.images_dir / "outputs" + @property + def videos_dir(self) -> Path: + return self.data_dir / "videos" + + @property + def video_outputs_dir(self) -> Path: + return self.videos_dir / "outputs" + def _normalize_slug(value: str, fallback: str) -> str: cleaned = "".join(character.lower() if character.isalnum() else "-" for character in value.strip()) @@ -193,11 +201,20 @@ def _default_settings(default_port: int, data_dir: Path) -> dict[str, Any]: "modelDirectories": [dict(entry) for entry in DEFAULT_MODEL_DIRECTORIES], "preferredServerPort": default_port, "allowRemoteConnections": False, + # Default on — the API token is auto-generated and passed to the + # frontend via /api/auth/session, so the built-in UI works out of + # the box. Users who connect external clients (OpenWebUI, scripts, + # another desktop app) can flip this off from the Server tab. + "requireApiAuth": True, "autoStartServer": False, "launchPreferences": dict(DEFAULT_LAUNCH_PREFERENCES), "remoteProviders": [], "huggingFaceToken": "", "dataDirectory": str(data_dir), + # Empty string means "use the default under dataDirectory". Anything + # else is treated as an absolute (or ~-relative) override path. + "imageOutputsDirectory": "", + "videoOutputsDirectory": "", } @@ -301,6 +318,9 @@ def _load_settings(path: Path, default_port: int, data_dir: Path) -> dict[str, A settings["preferredServerPort"] = default_port settings["allowRemoteConnections"] = bool(payload.get("allowRemoteConnections", False)) + # Default True: if the key is missing from an older settings.json we + # preserve the secure default rather than silently opening the API. 
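+    # (app.py's _resolve_require_api_auth can still override the stored value
+    # at runtime via the CHAOSENGINE_REQUIRE_AUTH environment variable.)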
+ settings["requireApiAuth"] = bool(payload.get("requireApiAuth", True)) settings["autoStartServer"] = bool(payload.get("autoStartServer", False)) settings["launchPreferences"] = _normalize_launch_preferences(payload.get("launchPreferences")) @@ -311,6 +331,12 @@ def _load_settings(path: Path, default_port: int, data_dir: Path) -> dict[str, A if hf_token: os.environ["HF_TOKEN"] = hf_token os.environ["HUGGING_FACE_HUB_TOKEN"] = hf_token + + for key in ("imageOutputsDirectory", "videoOutputsDirectory"): + raw = payload.get(key) + if isinstance(raw, str): + settings[key] = raw.strip() + return settings diff --git a/backend_service/helpers/system.py b/backend_service/helpers/system.py index 4e87732..3749e56 100644 --- a/backend_service/helpers/system.py +++ b/backend_service/helpers/system.py @@ -413,7 +413,7 @@ def _build_system_snapshot(app_version: str, app_started_at: float) -> dict[str, ) def _get_cache_strategies(): - from compression import registry + from cache_compression import registry return registry.available() def _get_dflash_info(): diff --git a/backend_service/helpers/video.py b/backend_service/helpers/video.py new file mode 100644 index 0000000..28b24f4 --- /dev/null +++ b/backend_service/helpers/video.py @@ -0,0 +1,307 @@ +"""Video model helpers: variant lookup, install detection, payload shaping, +output CRUD. + +Mirrors ``helpers/images.py`` so the routes for ``/api/video/*`` can drop in +alongside the image routes without a new mental model. +""" + +from __future__ import annotations + +import base64 +import json +from datetime import datetime +from pathlib import Path +from typing import Any + +from backend_service.catalog import VIDEO_MODEL_FAMILIES +from backend_service.helpers.formatting import _bytes_to_gb +from backend_service.helpers.huggingface import _format_release_label, _hf_repo_snapshot_dir +from backend_service.helpers.images import _snapshot_on_disk_bytes +from backend_service.image_runtime import validate_local_diffusers_snapshot + + +def _video_model_payloads(library: list[dict[str, Any]]) -> list[dict[str, Any]]: + """Return the catalog families enriched with per-variant availability. + + We deliberately don't hit Hugging Face live metadata from here — the image + version does for download counts, etc. We can bolt that on later if the + discover UX needs it. For now each variant just knows whether its local + snapshot is ready to load. + """ + families: list[dict[str, Any]] = [] + for family in VIDEO_MODEL_FAMILIES: + variants: list[dict[str, Any]] = [] + for variant in family["variants"]: + enriched = dict(variant) + repo = str(enriched.get("repo") or "") + enriched["availableLocally"] = _video_repo_runtime_ready(repo) if repo else False + enriched["hasLocalData"] = enriched["availableLocally"] or _video_repo_has_any_local_data(repo) + enriched["familyName"] = family["name"] + release_date = str(enriched.get("releaseDate") or "").strip() or None + enriched["releaseDate"] = release_date + enriched["releaseLabel"] = _format_release_label(release_date) + # Absolute path to the HF snapshot, used by the Reveal File button. + # Only populated when there is actually something on disk so the + # UI can reliably hide the button otherwise. 
+ snapshot_dir = _hf_repo_snapshot_dir(repo) if (enriched["hasLocalData"] and repo) else None + enriched["localPath"] = str(snapshot_dir) if snapshot_dir else None + on_disk_bytes = _snapshot_on_disk_bytes(snapshot_dir) + enriched["onDiskBytes"] = on_disk_bytes + enriched["onDiskGb"] = _bytes_to_gb(on_disk_bytes) if on_disk_bytes else None + variants.append(enriched) + payload = dict(family) + payload["variants"] = variants + families.append(payload) + return families + + +def _find_video_variant(model_id: str) -> dict[str, Any] | None: + for family in VIDEO_MODEL_FAMILIES: + for variant in family["variants"]: + if variant["id"] == model_id: + return variant + return None + + +def _find_video_variant_by_repo(repo: str) -> dict[str, Any] | None: + for family in VIDEO_MODEL_FAMILIES: + for variant in family["variants"]: + if variant["repo"] == repo: + return variant + return None + + +def _is_video_repo(repo_id: str) -> bool: + return any( + str(variant.get("repo") or "") == repo_id + for family in VIDEO_MODEL_FAMILIES + for variant in family["variants"] + ) + + +def _video_repo_runtime_ready(repo_id: str) -> bool: + """True if the local snapshot is complete enough to load via diffusers.""" + snapshot_dir = _hf_repo_snapshot_dir(repo_id) + if snapshot_dir is None: + return False + return validate_local_diffusers_snapshot(snapshot_dir, repo_id) is None + + +def _video_repo_has_any_local_data(repo_id: str) -> bool: + """True if we have a partial or complete snapshot on disk. + + Distinct from ``_video_repo_runtime_ready`` — this is the softer signal used + to tell the UI "something downloaded for this repo" even if it's incomplete. + """ + snapshot_dir = _hf_repo_snapshot_dir(repo_id) + if snapshot_dir is None: + return False + root = Path(snapshot_dir) + if not root.exists(): + return False + try: + return any( + candidate.is_file() or candidate.is_symlink() + for candidate in root.iterdir() + if not candidate.name.startswith(".") + ) + except OSError: + return False + + +def _video_variant_available_locally(variant: dict[str, Any]) -> bool: + repo = str(variant.get("repo") or "") + if not repo: + return False + return _video_repo_runtime_ready(repo) + + +def _video_download_repo_ids() -> set[str]: + return { + str(variant.get("repo") or "") + for family in VIDEO_MODEL_FAMILIES + for variant in family["variants"] + if str(variant.get("repo") or "") + } + + +# Diffusers pipelines only need the standard per-component folders +# (scheduler/, text_encoder/, tokenizer/, transformer/ or unet/, vae/) +# plus ``model_index.json`` at the root. Video repos frequently ship +# historical checkpoints (``ltx-video-0.9.safetensors`` and friends) as +# siblings — without an allowlist ``snapshot_download`` pulls every one +# of them, which can inflate a 2 GB diffusers pipeline into a 200 GB +# download. Keep this list conservative so future component folders still +# come through, but block the legacy standalone safetensors. +_VIDEO_DIFFUSERS_ALLOW_PATTERNS: list[str] = [ + "model_index.json", + "scheduler/**", + "text_encoder/**", + "text_encoder_2/**", + "text_encoder_3/**", + "tokenizer/**", + "tokenizer_2/**", + "tokenizer_3/**", + "transformer/**", + "transformer_2/**", + "unet/**", + "vae/**", + "feature_extractor/**", + "image_encoder/**", + "safety_checker/**", + "*.md", + "LICENSE*", +] + + +def _video_repo_allow_patterns(repo_id: str) -> list[str] | None: + """Patterns to pass to ``snapshot_download`` for a video repo. 
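+
+    A caller-side sketch (``snapshot_download`` is ``huggingface_hub``'s
+    standard helper; the repo id is illustrative):
+
+        snapshot_download(
+            "Lightricks/LTX-Video",
+            allow_patterns=_video_repo_allow_patterns("Lightricks/LTX-Video"),
+        )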
+ + Returns ``None`` for non-video repos so the caller can pass the value + through unconditionally without special-casing. For video repos the + allowlist keeps the download scoped to the diffusers pipeline layout + — see the comment on ``_VIDEO_DIFFUSERS_ALLOW_PATTERNS`` for why this + matters. + """ + if not _is_video_repo(repo_id): + return None + return list(_VIDEO_DIFFUSERS_ALLOW_PATTERNS) + + +def _video_download_validation_error(repo_id: str) -> str | None: + if not _is_video_repo(repo_id): + return None + snapshot_dir = _hf_repo_snapshot_dir(repo_id) + if snapshot_dir is None: + return ( + f"Download did not produce a local snapshot for {repo_id}. " + "Retry the download and make sure the backend can access Hugging Face." + ) + return validate_local_diffusers_snapshot(snapshot_dir, repo_id) + + +# ---- Video output CRUD ---- +# +# Video artifacts differ from image artifacts in one important way: an mp4 is +# the real deliverable and there's no cheap "preview" we can embed inline. The +# frontend loads the file directly via a dedicated ``/file`` endpoint rather +# than getting a base64 data URL in the list payload. + + +def _video_output_directory(video_outputs_dir: Path, created_at: str | None = None) -> Path: + day_label = (created_at or datetime.utcnow().isoformat())[:10] + output_dir = video_outputs_dir / day_label + output_dir.mkdir(parents=True, exist_ok=True) + return output_dir + + +def _hydrate_video_artifact(payload: dict[str, Any]) -> dict[str, Any]: + prompt = str(payload.get("prompt") or "") + model_name = str(payload.get("modelName") or payload.get("modelId") or "Video model") + return { + "artifactId": str(payload.get("artifactId") or ""), + "modelId": str(payload.get("modelId") or ""), + "modelName": model_name, + "prompt": prompt, + "negativePrompt": str(payload.get("negativePrompt") or ""), + "width": int(payload.get("width") or 768), + "height": int(payload.get("height") or 512), + "numFrames": int(payload.get("numFrames") or 0), + "fps": int(payload.get("fps") or 24), + "steps": int(payload.get("steps") or 0), + "guidance": float(payload.get("guidance") or 0.0), + "seed": int(payload.get("seed") or 0), + "createdAt": str( + payload.get("createdAt") or datetime.utcnow().replace(microsecond=0).isoformat() + "Z" + ), + "durationSeconds": float(payload.get("durationSeconds") or 0.0), + "clipDurationSeconds": float(payload.get("clipDurationSeconds") or 0.0), + "videoPath": str(payload.get("videoPath") or "") or None, + "metadataPath": str(payload.get("metadataPath") or "") or None, + "videoMimeType": str(payload.get("videoMimeType") or "video/mp4"), + "videoExtension": str(payload.get("videoExtension") or "mp4"), + "runtimeLabel": str(payload.get("runtimeLabel") or ""), + "runtimeNote": str(payload.get("runtimeNote") or "") or None, + } + + +def _save_video_artifact(artifact: dict[str, Any], video_outputs_dir: Path) -> dict[str, Any]: + created_at = str( + artifact.get("createdAt") or datetime.utcnow().replace(microsecond=0).isoformat() + "Z" + ) + output_dir = _video_output_directory(video_outputs_dir, created_at) + artifact_id = str(artifact["artifactId"]) + extension = str(artifact.get("videoExtension") or "mp4").lstrip(".") + video_path = output_dir / f"{artifact_id}.{extension}" + metadata_path = output_dir / f"{artifact_id}.json" + + video_bytes = artifact.get("videoBytes") + if isinstance(video_bytes, str): + video_bytes = base64.b64decode(video_bytes.encode("ascii")) + if isinstance(video_bytes, (bytes, bytearray)): + 
video_path.write_bytes(bytes(video_bytes)) + else: + raise ValueError( + "Cannot persist video artifact: no raw bytes supplied. " + "Pass `videoBytes` as bytes from the generation pipeline." + ) + + persisted = { + **artifact, + "videoPath": str(video_path), + "metadataPath": str(metadata_path), + } + metadata_payload = { + key: value + for key, value in persisted.items() + if key not in {"videoBytes", "videoMimeType", "videoExtension"} + } + metadata_path.write_text(json.dumps(metadata_payload, indent=2), encoding="utf-8") + return _hydrate_video_artifact(persisted) + + +def _load_video_outputs(video_outputs_dir: Path) -> list[dict[str, Any]]: + if not video_outputs_dir.exists(): + return [] + outputs: list[dict[str, Any]] = [] + for metadata_path in video_outputs_dir.rglob("*.json"): + try: + payload = json.loads(metadata_path.read_text(encoding="utf-8")) + except (OSError, json.JSONDecodeError): + continue + if not isinstance(payload, dict): + continue + outputs.append(_hydrate_video_artifact({**payload, "metadataPath": str(metadata_path)})) + outputs.sort(key=lambda item: str(item.get("createdAt") or ""), reverse=True) + return outputs + + +def _find_video_output(artifact_id: str, video_outputs_dir: Path) -> dict[str, Any] | None: + for output in _load_video_outputs(video_outputs_dir): + if output.get("artifactId") == artifact_id: + return output + return None + + +def _delete_video_output(artifact_id: str, video_outputs_dir: Path) -> bool: + if not video_outputs_dir.exists(): + return False + found = False + for metadata_path in video_outputs_dir.rglob(f"{artifact_id}.json"): + found = True + video_path = metadata_path.with_suffix(".mp4") + try: + payload = json.loads(metadata_path.read_text(encoding="utf-8")) + if isinstance(payload, dict) and payload.get("videoPath"): + video_path = Path(str(payload["videoPath"])) + except (OSError, json.JSONDecodeError): + pass + try: + metadata_path.unlink(missing_ok=True) + except OSError: + pass + try: + video_path.unlink(missing_ok=True) + except OSError: + pass + return found diff --git a/backend_service/image_runtime.py b/backend_service/image_runtime.py index 4c456cb..b4f4cf4 100644 --- a/backend_service/image_runtime.py +++ b/backend_service/image_runtime.py @@ -4,16 +4,28 @@ import importlib.util import io import os +import platform import textwrap import time import gc import secrets + +from backend_service.helpers.gpu import nvidia_gpu_present as _nvidia_gpu_present from colorsys import hsv_to_rgb from dataclasses import asdict, dataclass, field from pathlib import Path from threading import RLock from typing import Any +from backend_service.progress import ( + IMAGE_PROGRESS, + PHASE_DECODING, + PHASE_DIFFUSING, + PHASE_ENCODING, + PHASE_LOADING, + PHASE_SAVING, +) + WORKSPACE_ROOT = Path(__file__).resolve().parents[1] MAX_IMAGE_SEED = 2147483647 @@ -50,6 +62,59 @@ def validate_local_diffusers_snapshot(local_root: Path, repo: str | None = None) f"(missing model_index.json; found {visible_label}). {_snapshot_retry_guidance(repo)}" ) + # Verify each component listed in model_index.json actually has its folder + # on disk with a recognisable config file. Diffusers will otherwise raise a + # cryptic "no file named config.json found in directory " + # error from inside ``from_pretrained`` that points at the snapshot root, + # which is hard to action without knowing which subfolder is missing. 
+ # This typically happens when a download started before allow_patterns was + # applied — HF queues the legacy root-level safetensors first and the user + # tries to load before the per-component folders finish landing. + try: + pipeline_index = json.loads(model_index_path.read_text(encoding="utf-8")) + except (OSError, json.JSONDecodeError) as exc: + return ( + "The local snapshot's model_index.json could not be read " + f"({exc}). {_snapshot_retry_guidance(repo)}" + ) + + missing_components: list[str] = [] + if isinstance(pipeline_index, dict): + # Any of these names being present in a subfolder is enough to call it + # a real component directory — diffusers picks the right one based on + # the class type at load time. + component_config_names = ( + "config.json", + "scheduler_config.json", + "tokenizer_config.json", + "preprocessor_config.json", + ) + for component_name, descriptor in pipeline_index.items(): + if component_name.startswith("_"): + continue # ``_class_name`` / ``_diffusers_version`` metadata + if not isinstance(descriptor, (list, tuple)) or len(descriptor) < 2: + continue + # Pipelines list ``[null, null]`` for optional components that the + # checkpoint deliberately omits (e.g. safety_checker on community + # models). Skip those — they aren't expected on disk. + if descriptor[0] is None or descriptor[1] is None: + continue + component_dir = local_root / component_name + if not component_dir.is_dir(): + missing_components.append(component_name) + continue + if not any((component_dir / name).exists() for name in component_config_names): + missing_components.append(component_name) + + if missing_components: + label = ", ".join(missing_components[:4]) + if len(missing_components) > 4: + label += f" (+{len(missing_components) - 4} more)" + return ( + "The local snapshot is incomplete and cannot be opened as a diffusers pipeline " + f"(missing components: {label}). {_snapshot_retry_guidance(repo)}" + ) + broken_links: list[str] = [] weight_index_paths: list[Path] = [] try: @@ -338,73 +403,144 @@ def probe(self) -> ImageRuntimeStatus: ) device = self._detect_device(torch) + message = ( + "Real local generation is available. Download an image model locally, then Image Studio " + "will use the diffusers runtime instead of the placeholder engine." + ) + # A CPU-only torch on a machine with an NVIDIA GPU is the single + # most common "image gen takes 10 minutes per step" misconfiguration + # on Windows and Linux. Detect the NVIDIA driver via nvidia-smi and, + # if torch didn't pick up CUDA, surface an actionable hint instead + # of letting users watch the progress bar crawl. + if device == "cpu" and platform.system() in ("Windows", "Linux") and _nvidia_gpu_present(): + message = ( + "torch was imported but CUDA is unavailable — diffusion will run on CPU " + "(expect minutes per step). Reinstall with the CUDA wheel: " + "pip install --upgrade --force-reinstall torch " + "--index-url https://download.pytorch.org/whl/cu121" + ) return ImageRuntimeStatus( activeEngine="diffusers", realGenerationAvailable=True, device=device, pythonExecutable=_resolve_image_python(), - message=( - "Real local generation is available. Download an image model locally, then Image Studio " - "will use the diffusers runtime instead of the placeholder engine." 
- ), + message=message, loadedModelRepo=self._loaded_repo, ) def generate(self, config: ImageGenerationConfig) -> list[GeneratedImage]: - pipeline = self._ensure_pipeline(config.repo) - torch = self._torch - if torch is None: - raise RuntimeError("PyTorch was not initialised for the diffusers runtime.") - generator_device = "cpu" if self._device == "mps" else (self._device or "cpu") - base_seed = _resolve_base_seed(config.seed) - generators = [ - torch.Generator(device=generator_device).manual_seed(base_seed + index) - for index in range(config.batchSize) - ] - - kwargs = self._build_pipeline_kwargs(config, generators if len(generators) > 1 else generators[0]) - lowered_repo = config.repo.lower() - if "flux" in lowered_repo: - kwargs.pop("negative_prompt", None) - kwargs["num_inference_steps"] = min(config.steps, 8) - if "turbo" in lowered_repo: - kwargs["num_inference_steps"] = min(config.steps, 8) - kwargs["guidance_scale"] = min(config.guidance, 2.5) - - started = time.perf_counter() + # Begin reporting progress before we touch the pipeline. ``_ensure_pipeline`` + # publishes its own ``loading`` phase if it actually has to materialise + # the pipeline, but we still want a tracker entry from the moment the + # request lands so the UI's first poll has something to render. + IMAGE_PROGRESS.begin( + run_label=self._format_run_label(config), + total_steps=max(1, int(config.steps)), + phase=PHASE_LOADING, + message=f"Preparing {config.modelName}", + ) try: - result = pipeline(**kwargs) - except TypeError: - kwargs.pop("negative_prompt", None) - result = pipeline(**kwargs) - elapsed = max(0.1, time.perf_counter() - started) - - artifacts: list[GeneratedImage] = [] - for index, image in enumerate(getattr(result, "images", []) or []): - if image.mode != "RGB": - image = image.convert("RGB") - if image.getbbox() is None: - raise RuntimeError( - "The image runtime returned an all-black frame instead of a real image. " - f"Model: {config.repo}. Device: {self._device or 'cpu'}. " - "Try restarting the backend and generating again. If this keeps happening on Apple Silicon, " - "the model likely needs a safer precision path." 
- ) - buffer = io.BytesIO() - image.save(buffer, format="PNG", optimize=True) - artifacts.append( - GeneratedImage( - seed=base_seed + index, - bytes=buffer.getvalue(), - extension="png", - mimeType="image/png", - durationSeconds=round(elapsed / max(1, config.batchSize), 1), - runtimeLabel=f"{self.runtime_label} ({self._device or 'cpu'})", - ) + pipeline = self._ensure_pipeline(config.repo) + torch = self._torch + if torch is None: + raise RuntimeError("PyTorch was not initialised for the diffusers runtime.") + IMAGE_PROGRESS.set_phase(PHASE_ENCODING, message="Encoding prompt") + generator_device = "cpu" if self._device == "mps" else (self._device or "cpu") + base_seed = _resolve_base_seed(config.seed) + generators = [ + torch.Generator(device=generator_device).manual_seed(base_seed + index) + for index in range(config.batchSize) + ] + + kwargs = self._build_pipeline_kwargs(config, generators if len(generators) > 1 else generators[0]) + lowered_repo = config.repo.lower() + if "flux" in lowered_repo: + kwargs.pop("negative_prompt", None) + kwargs["num_inference_steps"] = min(config.steps, 8) + if "turbo" in lowered_repo: + kwargs["num_inference_steps"] = min(config.steps, 8) + kwargs["guidance_scale"] = min(config.guidance, 2.5) + + # Wire the diffusers per-step callback so the UI sees the bar move + # in lockstep with denoising, which is the bulk of the wall time on + # most models. ``callback_on_step_end`` is the non-deprecated name + # in modern diffusers (>=0.27); some pipelines also accept the + # legacy ``callback`` arg, but we prefer the new one. + total_steps = int(kwargs.get("num_inference_steps", config.steps) or config.steps) + IMAGE_PROGRESS.set_phase( + PHASE_DIFFUSING, + message=self._diffuse_message(config), ) - if not artifacts: - raise RuntimeError("Diffusers returned no images.") - return artifacts + # Re-publish the totalSteps in case ``num_inference_steps`` was + # clamped above (Flux/Turbo cap at 8). + IMAGE_PROGRESS.set_step(0, total=max(1, total_steps)) + + def _on_step_end(_pipeline: Any, step: int, _timestep: Any, callback_kwargs: dict[str, Any]): + # Diffusers calls this *after* step ``step`` finishes, so step + # 0 means "one step done". Convert to the 1-indexed value the + # UI wants to display. + IMAGE_PROGRESS.set_step(step + 1, total=max(1, total_steps)) + return callback_kwargs + + kwargs.setdefault("callback_on_step_end", _on_step_end) + + started = time.perf_counter() + try: + result = pipeline(**kwargs) + except TypeError as exc: + # Older diffusers versions don't accept ``callback_on_step_end`` + # — drop it and retry once before bubbling the original error. + if "callback_on_step_end" in str(exc): + kwargs.pop("callback_on_step_end", None) + try: + result = pipeline(**kwargs) + except TypeError: + kwargs.pop("negative_prompt", None) + result = pipeline(**kwargs) + else: + kwargs.pop("negative_prompt", None) + result = pipeline(**kwargs) + elapsed = max(0.1, time.perf_counter() - started) + + IMAGE_PROGRESS.set_phase(PHASE_DECODING, message="Decoding pixels") + + artifacts: list[GeneratedImage] = [] + for index, image in enumerate(getattr(result, "images", []) or []): + if image.mode != "RGB": + image = image.convert("RGB") + if image.getbbox() is None: + raise RuntimeError( + "The image runtime returned an all-black frame instead of a real image. " + f"Model: {config.repo}. Device: {self._device or 'cpu'}. " + "Try restarting the backend and generating again. If this keeps happening on Apple Silicon, " + "the model likely needs a safer precision path." 
+ ) + buffer = io.BytesIO() + image.save(buffer, format="PNG", optimize=True) + artifacts.append( + GeneratedImage( + seed=base_seed + index, + bytes=buffer.getvalue(), + extension="png", + mimeType="image/png", + durationSeconds=round(elapsed / max(1, config.batchSize), 1), + runtimeLabel=f"{self.runtime_label} ({self._device or 'cpu'})", + ) + ) + if not artifacts: + raise RuntimeError("Diffusers returned no images.") + IMAGE_PROGRESS.set_phase(PHASE_SAVING, message="Saving to gallery") + return artifacts + finally: + IMAGE_PROGRESS.finish() + + def _diffuse_message(self, config: ImageGenerationConfig) -> str: + if config.batchSize > 1: + return f"Diffusing {config.batchSize} images" + return "Diffusing image" + + def _format_run_label(self, config: ImageGenerationConfig) -> str: + return f"{config.modelName} · {config.width}x{config.height}" def preload(self, repo: str) -> ImageRuntimeStatus: self._ensure_pipeline(repo) @@ -422,6 +558,11 @@ def _ensure_pipeline(self, repo: str) -> Any: if self._pipeline is not None and self._loaded_repo == repo: return self._pipeline + # Loading a pipeline can take 10-60s on cold disk. Surface that + # explicitly to the UI so the progress bar stops sitting at 0% + # while we read 5GB of weights from the SSD. + IMAGE_PROGRESS.set_phase(PHASE_LOADING, message=f"Loading {repo}") + if self._pipeline is not None and self._loaded_repo != repo: self._release_pipeline() diff --git a/backend_service/inference.py b/backend_service/inference.py index 9f33c9b..5a05a7c 100644 --- a/backend_service/inference.py +++ b/backend_service/inference.py @@ -1704,7 +1704,7 @@ def _build_command( *fell_back_to_native* is ``True`` when pre-validation detected unsupported cache types and silently switched to f16. """ - from compression import registry as _strategy_registry + from cache_compression import registry as _strategy_registry strategy = _strategy_registry.get(cache_strategy) or _strategy_registry.default() binary = self._select_llama_binary(strategy) @@ -1842,7 +1842,7 @@ def load_model( runtime_note = None actual_strategy = cache_strategy actual_fit = fit_model_in_memory - from compression import registry as _strategy_registry + from cache_compression import registry as _strategy_registry failed_strategy_name: str | None = None # Try the requested strategy first. If it fails, try ChaosEngine diff --git a/backend_service/mlx_worker.py b/backend_service/mlx_worker.py index 92c6dcb..a87a4b2 100644 --- a/backend_service/mlx_worker.py +++ b/backend_service/mlx_worker.py @@ -780,7 +780,7 @@ def _runtime_fields( def _make_cache(self) -> tuple[Any | None, str | None]: """Build the prompt cache for the active strategy. 
Returns (cache, note).""" - from compression import registry + from cache_compression import registry strategy = registry.get(self.cache_strategy) if strategy is None or self.cache_strategy == "native": return None, None @@ -801,7 +801,7 @@ def _make_cache(self) -> tuple[Any | None, str | None]: def _generate_dflash(self, request: dict[str, Any]) -> dict[str, Any]: """Generate using DFLASH speculative decoding.""" - from dflash_mlx.runtime import generate_dflash_once + from dflash_mlx.runtime import stream_dflash_generate # Build prompt text system_prompt = request.get("systemPrompt") @@ -822,7 +822,11 @@ def _generate_dflash(self, request: dict[str, Any]) -> dict[str, Any]: if eos_token_id is not None and int(eos_token_id) not in eos_token_ids: eos_token_ids.append(int(eos_token_id)) - summary = generate_dflash_once( + # ``stream_dflash_generate`` (upstream v0.1.4) yields per-token events + # followed by a final ``{"event": "summary", ...}`` payload whose shape + # matches what the old ``generate_dflash_once`` helper returned. + summary: dict[str, Any] = {} + for event in stream_dflash_generate( target_model=self._dflash_target or self.model, tokenizer=self.tokenizer, draft_model=self._dflash_generator, @@ -831,7 +835,9 @@ def _generate_dflash(self, request: dict[str, Any]) -> dict[str, Any]: use_chat_template=False, stop_token_ids=eos_token_ids, prompt_tokens_override=prompt_tokens, - ) + ): + if event.get("event") == "summary": + summary = dict(event) gen_tokens = [int(token_id) for token_id in summary.get("generated_token_ids", [])] text = self.tokenizer.decode(gen_tokens).strip() if gen_tokens else "" diff --git a/backend_service/models/__init__.py b/backend_service/models/__init__.py index 8b5cd6d..20af2dc 100644 --- a/backend_service/models/__init__.py +++ b/backend_service/models/__init__.py @@ -111,11 +111,16 @@ class UpdateSettingsRequest(BaseModel): modelDirectories: list[ModelDirectoryRequest] | None = None preferredServerPort: int | None = Field(default=None, ge=1024, le=65535) allowRemoteConnections: bool | None = None + requireApiAuth: bool | None = None autoStartServer: bool | None = None launchPreferences: LaunchPreferencesRequest | None = None remoteProviders: list[RemoteProviderRequest] | None = None huggingFaceToken: str | None = Field(default=None, max_length=512) dataDirectory: str | None = Field(default=None, max_length=4096) + # Per-modality output overrides. Empty string clears the override and + # restores the default (data-dir/images/outputs or data-dir/videos/outputs). + imageOutputsDirectory: str | None = Field(default=None, max_length=4096) + videoOutputsDirectory: str | None = Field(default=None, max_length=4096) class OpenAIMessage(BaseModel): @@ -207,3 +212,30 @@ class ImageRuntimePreloadRequest(BaseModel): class ImageRuntimeUnloadRequest(BaseModel): modelId: str | None = Field(default=None, min_length=1, max_length=256) + + +class VideoRuntimePreloadRequest(BaseModel): + modelId: str = Field(min_length=1, max_length=256) + + +class VideoRuntimeUnloadRequest(BaseModel): + modelId: str | None = Field(default=None, min_length=1, max_length=256) + + +class VideoGenerationRequest(BaseModel): + """Shape accepted by POST /api/video/generate. + + Defaults are intentionally conservative — num_frames and steps in particular + dominate generation time on consumer hardware, so we err on the side of a + short, fast clip and let the user dial up quality from the Studio UI. 
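+
+    A short-clip request sketch (the modelId is illustrative; omitted
+    fields fall back to the defaults declared below):
+
+        {"modelId": "ltx-video-2b", "prompt": "a fox at dawn",
+         "numFrames": 33, "steps": 25}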
+ """ + modelId: str = Field(min_length=1, max_length=256) + prompt: str = Field(min_length=1, max_length=4000) + negativePrompt: str | None = Field(default=None, max_length=4000) + width: int = Field(default=768, ge=256, le=2048) + height: int = Field(default=512, ge=256, le=2048) + numFrames: int = Field(default=97, ge=8, le=257) + fps: int = Field(default=24, ge=1, le=60) + steps: int = Field(default=50, ge=1, le=100) + guidance: float = Field(default=3.0, ge=1.0, le=20.0) + seed: int | None = Field(default=None, ge=0, le=2147483647) diff --git a/backend_service/plugins/__init__.py b/backend_service/plugins/__init__.py index 79c8814..f218910 100644 --- a/backend_service/plugins/__init__.py +++ b/backend_service/plugins/__init__.py @@ -88,7 +88,7 @@ def discover_from_directory(self, plugins_dir: Path): def register_builtins(self): """Register all built-in components as plugins.""" # Cache strategies - from compression import registry as cache_registry + from cache_compression import registry as cache_registry for strategy in cache_registry._strategies.values(): manifest = PluginManifest( id=f"cache.{strategy.strategy_id}", diff --git a/backend_service/progress.py b/backend_service/progress.py new file mode 100644 index 0000000..6d5b773 --- /dev/null +++ b/backend_service/progress.py @@ -0,0 +1,155 @@ +"""Real-time generation progress tracking for image + video runtimes. + +The diffusers pipelines used by ``image_runtime`` and ``video_runtime`` each +take 30 seconds to several minutes to finish. The frontend used to render an +arbitrary "estimated seconds" bar that drifted out of sync with reality on +slower hardware. This module gives the runtimes a tiny thread-safe scratchpad +they can update as they progress, and the routes a way to report that state +back to the UI so the progress bar reflects what's actually happening. + +A tracker exposes four operations: + +* ``begin(...)`` — call when generation starts. Resets state and stamps the + start time. +* ``set_phase(...)`` — call when the runtime moves into a new phase + (``loading``, ``encoding``, ``diffusing``, ``decoding``, ``saving``). The + string is opaque to the backend — the frontend maps it onto the same phase + IDs the modal already understands. +* ``set_step(step, total)`` — call inside ``callback_on_step_end`` to publish + per-step progress during diffusion. +* ``finish(...)`` — call after the pipeline returns (or raises). Marks the + tracker idle so the next poll cycle stops showing stale values. + +``snapshot()`` returns a JSON-serialisable dict the routes hand back to the +frontend. ``active=False`` means "no run in flight" — callers should fall +back to client-side estimates. + +Two module-level singletons (`IMAGE_PROGRESS`, `VIDEO_PROGRESS`) are exposed +so the runtimes and routes share the same instance without the +``ChaosEngineState`` plumbing having to know about it. +""" + +from __future__ import annotations + +import time +from threading import RLock +from typing import Any + + +# Phase IDs the frontend expects. Keep these in sync with the modal's phase +# list — adding a new phase here without updating the modal will just show up +# as "unknown phase" in the UI but won't crash. +PHASE_IDLE = "idle" +PHASE_LOADING = "loading" +PHASE_ENCODING = "encoding" +PHASE_DIFFUSING = "diffusing" +PHASE_DECODING = "decoding" +PHASE_SAVING = "saving" + + +class ProgressTracker: + """Thread-safe scratchpad for one in-flight generation at a time. 
+ + The runtimes are already serialised through their own ``RLock``s — only + one image (or one video) can render at a time per process — so we don't + need to multiplex multiple runs. We just need the GET endpoint and the + pipeline callback to read/write the same state without tearing. + """ + + def __init__(self, *, kind: str) -> None: + self._lock = RLock() + # ``kind`` is included in the snapshot so logs can tell image and + # video apart at a glance. + self._kind = kind + self._active = False + self._phase = PHASE_IDLE + self._message = "" + self._step = 0 + self._total_steps = 0 + self._started_at = 0.0 + self._updated_at = 0.0 + # Optional run-shape metadata so the UI can render labels like + # "Diffusing 3 images" without a separate request. + self._run_label: str | None = None + + def begin( + self, + *, + run_label: str | None = None, + total_steps: int = 0, + phase: str = PHASE_LOADING, + message: str = "", + ) -> None: + with self._lock: + now = time.time() + self._active = True + self._phase = phase + self._message = message + self._step = 0 + self._total_steps = max(0, int(total_steps)) + self._started_at = now + self._updated_at = now + self._run_label = run_label + + def set_phase(self, phase: str, message: str = "") -> None: + """Move into a new phase. Resets ``step`` so per-phase progress is + measured from zero rather than carrying over the previous phase's + counter.""" + with self._lock: + if not self._active: + # Setting a phase before ``begin()`` is meaningless — it would + # leave ``started_at`` at 0 and the elapsed time would be + # nonsense. Treat it as an implicit ``begin`` so callers don't + # have to remember the order on simple paths. + self._active = True + self._started_at = time.time() + self._step = 0 + self._total_steps = 0 + self._run_label = None + self._phase = phase + self._message = message + self._step = 0 + self._updated_at = time.time() + + def set_step(self, step: int, total: int | None = None) -> None: + with self._lock: + if not self._active: + return + self._step = max(0, int(step)) + if total is not None: + self._total_steps = max(0, int(total)) + self._updated_at = time.time() + + def finish(self, *, message: str = "") -> None: + with self._lock: + self._active = False + self._phase = PHASE_IDLE + self._message = message + self._step = 0 + self._total_steps = 0 + self._updated_at = time.time() + self._run_label = None + + def snapshot(self) -> dict[str, Any]: + with self._lock: + now = time.time() + elapsed = max(0.0, now - self._started_at) if self._active else 0.0 + return { + "kind": self._kind, + "active": self._active, + "phase": self._phase, + "message": self._message, + "step": self._step, + "totalSteps": self._total_steps, + "startedAt": self._started_at if self._active else 0.0, + "updatedAt": self._updated_at, + "elapsedSeconds": round(elapsed, 3), + "runLabel": self._run_label, + } + + +# Module-level singletons. The runtime managers and the route handlers both +# import these directly so we don't have to thread the tracker through +# ``ChaosEngineState`` constructor signatures. 
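+#
+# A runtime-side usage sketch, mirroring what ``image_runtime.generate``
+# does (the callback follows diffusers' ``callback_on_step_end`` contract):
+#
+#     IMAGE_PROGRESS.begin(run_label="model · 1024x1024", total_steps=30)
+#     try:
+#         ...  # run the pipeline; the per-step callback calls
+#         ...  # IMAGE_PROGRESS.set_step(step + 1, total=30)
+#     finally:
+#         IMAGE_PROGRESS.finish()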
+IMAGE_PROGRESS = ProgressTracker(kind="image") +VIDEO_PROGRESS = ProgressTracker(kind="video") diff --git a/backend_service/routes/__init__.py b/backend_service/routes/__init__.py index 65dde43..aec2ec6 100644 --- a/backend_service/routes/__init__.py +++ b/backend_service/routes/__init__.py @@ -11,6 +11,7 @@ def register_routes(app: FastAPI) -> None: from .models import router as models_router from .chat import router as chat_router from .images import router as images_router + from .video import router as video_router from .benchmarks import router as benchmarks_router from .cache import router as cache_router from .server import router as server_router @@ -29,6 +30,7 @@ def register_routes(app: FastAPI) -> None: app.include_router(chat_router) app.include_router(compare_router) app.include_router(images_router) + app.include_router(video_router) app.include_router(benchmarks_router) app.include_router(cache_router) app.include_router(server_router) diff --git a/backend_service/routes/health.py b/backend_service/routes/health.py index aac21a3..8ddf97e 100644 --- a/backend_service/routes/health.py +++ b/backend_service/routes/health.py @@ -4,6 +4,7 @@ from fastapi import APIRouter, Request +from backend_service.helpers.gpu import gpu_status_snapshot from backend_service.helpers.system import _runtime_label router = APIRouter() @@ -42,3 +43,15 @@ def runtime_status(request: Request) -> dict[str, Any]: active_requests=state.active_requests, requests_served=state.requests_served, ) + + +@router.get("/api/system/gpu-status") +def system_gpu_status() -> dict[str, Any]: + """Unified GPU availability summary for the frontend warning banner. + + Returns whether torch sees CUDA / MPS on the current host, whether an + NVIDIA driver is visible on ``PATH``, and a human-readable recommendation + string when torch fell back to CPU on a box that clearly has an NVIDIA + GPU. Safe to call before any model is loaded. + """ + return gpu_status_snapshot() diff --git a/backend_service/routes/images.py b/backend_service/routes/images.py index f91396a..0a27c88 100644 --- a/backend_service/routes/images.py +++ b/backend_service/routes/images.py @@ -25,6 +25,7 @@ _find_image_output, _delete_image_output, ) +from backend_service.progress import IMAGE_PROGRESS router = APIRouter() @@ -45,6 +46,17 @@ def image_runtime_status(request: Request) -> dict[str, Any]: return {"runtime": state.image_runtime.capabilities()} +@router.get("/api/images/progress") +def image_generation_progress() -> dict[str, Any]: + """Live progress snapshot for the in-flight image generation. + + Polled by the generation modal every ~500ms while the bar is visible. + When ``active`` is false the UI falls back to its own client-side + estimates rather than freezing the bar at 0%. 
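+
+    Response sketch (values illustrative; keys come from
+    ``ProgressTracker.snapshot``):
+
+        {"progress": {"kind": "image", "active": true, "phase": "diffusing",
+                      "step": 12, "totalSteps": 30, "elapsedSeconds": 8.4}}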
+ """ + return {"progress": IMAGE_PROGRESS.snapshot()} + + @router.post("/api/images/preload") def preload_image_model(request: Request, body: ImageRuntimePreloadRequest) -> dict[str, Any]: import traceback as _tb diff --git a/backend_service/routes/settings.py b/backend_service/routes/settings.py index b08139a..8e37f70 100644 --- a/backend_service/routes/settings.py +++ b/backend_service/routes/settings.py @@ -19,4 +19,10 @@ def settings(request: Request) -> dict[str, Any]: @router.patch("/api/settings") def update_settings(request: Request, body: UpdateSettingsRequest) -> dict[str, Any]: state = request.app.state.chaosengine - return state.update_settings(body) + result = state.update_settings(body) + # Hot-apply the API-auth toggle — the middleware reads this flag per + # request, so flipping it should take effect immediately without + # forcing the user to restart the server. Env var, if set, still wins. + from backend_service.app import _resolve_require_api_auth + request.app.state.chaosengine_require_api_auth = _resolve_require_api_auth(state.settings) + return result diff --git a/backend_service/routes/setup.py b/backend_service/routes/setup.py index 4c75276..e814d62 100644 --- a/backend_service/routes/setup.py +++ b/backend_service/routes/setup.py @@ -1,5 +1,6 @@ from __future__ import annotations +import shutil import subprocess import time from pathlib import Path @@ -17,8 +18,46 @@ "vllm": "vllm", "mlx": "mlx", "mlx-lm": "mlx-lm", - "dflash-mlx": "dflash-mlx", + # PyPI build is stale at 0.1.0; the up-to-date code lives on GitHub. + # The upstream removed all tags in April 2026, so we pin to a specific + # commit on main instead — v0.1.4 no longer resolves and fresh clones + # failed with "pathspec 'v0.1.4' did not match any file(s) known to + # git". Bump the pin when we validate a newer main SHA. + "dflash-mlx": "dflash-mlx @ git+https://github.com/bstnxbt/dflash-mlx.git@f825ffb268e50d531e8b6524413b0847334a14dd", "dflash": "dflash", + # Video output encoding — diffusers can produce frames without these, + # but exporting mp4/gif requires imageio + the ffmpeg plugin. The Video + # Studio surfaces a one-click installer when they're missing. + "imageio": "imageio", + "imageio-ffmpeg": "imageio-ffmpeg", + # Pipeline-specific tokenizer / text-encoder packages. Diffusers itself + # imports without them, but individual video pipelines need one or more + # at preload / generate time: + # - tiktoken: LTX-Video's T5 tokenizer ships in tiktoken format. + # - sentencepiece: Wan (UMT5-XXL), HunyuanVideo, CogVideoX, Mochi (T5). + # - protobuf: SentencePiece tokenizers HF loads. + # - ftfy: prompt-text preprocessing several pipelines use. + "tiktoken": "tiktoken", + "sentencepiece": "sentencepiece", + "protobuf": "protobuf", + "ftfy": "ftfy", + # Core image / video runtime packages. Installed together via the + # one-click button in Image Studio / Video Studio when the probe + # reports the real engine as unavailable. Each is also individually + # installable so we can retry a single failed package without redoing + # the whole set. + # + # We deliberately do not pin versions here — the backend ships with + # ``pyproject.toml`` extras that constrain them, and a bare ``pip + # install diffusers`` resolves compatibly with whatever torch the user + # already has. For a coordinated install of all of these, the Studio + # calls this endpoint once per package in order so a single failure + # doesn't abort the whole sequence. 
+ "diffusers": "diffusers", + "torch": "torch", + "accelerate": "accelerate", + "huggingface_hub": "huggingface_hub", + "pillow": "pillow", } _MANUAL_INSTALL_MESSAGES: dict[str, str] = { @@ -143,6 +182,230 @@ def refresh_capabilities_endpoint(request: Request) -> dict[str, Any]: return {"capabilities": caps.to_dict()} +# ------------------------------------------------------------------ +# CUDA torch install (Windows/Linux NVIDIA fallback recovery) +# ------------------------------------------------------------------ + +# cu124 covers Python 3.9-3.13 and driver 525+. cu121 only ships wheels +# for Python up to 3.12, so fresh Windows installs (3.13) fail on it. +# The nightly index sometimes has wheels for very new Python (e.g. 3.14) +# before they land in stable — we try it last so users on bleeding-edge +# Python aren't stuck. The endpoint walks this list in order and stops +# at the first success. +_CUDA_TORCH_INDEXES: list[str] = [ + "https://download.pytorch.org/whl/cu124", + "https://download.pytorch.org/whl/cu126", + "https://download.pytorch.org/whl/cu128", + "https://download.pytorch.org/whl/cu121", + "https://download.pytorch.org/whl/nightly/cu128", +] + + +def _read_python_version(python: str) -> str | None: + """Return e.g. ``3.13.2`` for the given Python interpreter, or ``None``.""" + try: + result = subprocess.run( + [python, "-c", "import sys; print('%d.%d.%d' % sys.version_info[:3])"], + capture_output=True, text=True, timeout=10, + ) + except (OSError, subprocess.TimeoutExpired): + return None + if result.returncode != 0: + return None + return result.stdout.strip() or None + + +def _site_packages_for(python_executable: str) -> Path | None: + """Return the site-packages directory for the given interpreter, or None.""" + try: + result = subprocess.run( + [ + python_executable, "-c", + "import sysconfig; print(sysconfig.get_paths().get('purelib') or sysconfig.get_paths().get('platlib') or '')", + ], + capture_output=True, text=True, timeout=10, + ) + except (OSError, subprocess.TimeoutExpired): + return None + if result.returncode != 0: + return None + path = (result.stdout or "").strip() + return Path(path) if path else None + + +def _purge_broken_distributions(site_packages: Path) -> list[str]: + """Delete ``~*`` stub directories pip leaves behind after an interrupted install. + + On Windows, pip atomically renames the old version of a package to ``~`` + before unpacking the new one. If the process is killed mid-install (antivirus, + a file lock, Ctrl-C) the stub is left behind. Subsequent ``pip install`` runs + then print ``WARNING: Ignoring invalid distribution ~arkupsafe`` forever and + sometimes refuse to heal the tree. Removing these stubs is cheap and safe — + they contain no authoritative data. + """ + if not site_packages.is_dir(): + return [] + removed: list[str] = [] + for entry in site_packages.iterdir(): + if not entry.name.startswith("~"): + continue + try: + if entry.is_dir(): + shutil.rmtree(entry, ignore_errors=True) + else: + entry.unlink(missing_ok=True) + if not entry.exists(): + removed.append(entry.name) + except OSError: + continue + return removed + + +def _all_attempts_lack_wheel(attempts: list[dict[str, Any]]) -> bool: + """True when pip reported 'No matching distribution' for every attempt. + + This is the signature of a Python version PyTorch doesn't ship wheels + for (either too old or too new) — the fix is a different Python, not + a different CUDA index. We surface that specifically to the UI so + the user doesn't keep retrying. 
+ """ + if not attempts: + return False + for attempt in attempts: + if attempt.get("ok"): + return False + text = (attempt.get("output") or "").lower() + if "no matching distribution" not in text and "from versions: none" not in text: + return False + return True + + +@router.post("/api/setup/install-cuda-torch") +def install_cuda_torch(request: Request) -> dict[str, Any]: + """Install a CUDA-enabled torch wheel into the backend runtime. + + The fresh-Windows-install case is Python 3.13 + system pip, which has + no cu121 wheel at all — the install fails with "Could not find a + version that satisfies the requirement torch". We try cu124 first + (broadest Python 3.9-3.13 coverage), then cu126 / cu128 / cu121 in + case the user's driver doesn't match the newest, and finally the + nightly cu128 index for very-new Python (e.g. 3.14). + + If every attempt fails with "No matching distribution", we set + ``noWheelForPython`` in the response — that means the user's Python + version is the problem, not the CUDA index, so the UI can tell them + to switch Python rather than keep retrying. The response always + includes ``pythonVersion`` so the UI can show which interpreter this + is targeting (important: it's the app's bundled venv, not the system + pip the user might reach from a shell). + + Torch already imported in this process stays CPU until the user + restarts the backend — we flag ``requiresRestart`` in the response + so the frontend can prompt appropriately. + """ + state = request.app.state.chaosengine + python = state.runtime.capabilities.pythonExecutable + python_version = _read_python_version(python) + + # Sweep pip's "~" stub directories before attempting the install. + # These are left behind by a prior interrupted install (common on Windows + # where Defender briefly locks .pyd files), and they cause two problems: + # 1. Noisy "WARNING: Ignoring invalid distribution ~arkupsafe" spam that + # confuses users reading install output. + # 2. pip sometimes tries to repair them and fails with an "Access denied" + # write to a .pyd that the running backend process has loaded (e.g. + # markupsafe/_speedups.cp314-win_amd64.pyd via FastAPI -> Jinja2). + # Removing the stubs is always safe — they hold no authoritative data. + site_packages = _site_packages_for(python) + purged: list[str] = [] + if site_packages is not None: + purged = _purge_broken_distributions(site_packages) + if purged: + state.add_log( + "server", "info", + f"Removed {len(purged)} broken pip stub(s) from {site_packages}: {', '.join(purged)}", + ) + + attempts: list[dict[str, Any]] = [] + ok = False + winning_output = "" + winning_index: str | None = None + + for index_url in _CUDA_TORCH_INDEXES: + # Two-pass install: + # Pass 1: --force-reinstall --no-deps swaps the torch wheel (CPU -> CUDA) + # without overwriting transitive deps like markupsafe. Those + # extensions are loaded into this Python process via FastAPI + # -> Jinja2; overwriting their .pyd / .so at runtime raises + # WinError 5 "Access is denied" and aborts the install. + # Pass 2: plain install (no --force) fills in any genuinely missing + # deps (e.g. nvidia-cublas-cu12 on Linux when swapping from + # CPU torch) without touching files that are already satisfied. 
+ cmd_swap = [ + python, "-m", "pip", "install", + "--upgrade", "--force-reinstall", "--no-deps", + "--index-url", index_url, + "torch>=2.4.0", + ] + state.add_log("server", "info", f"Installing CUDA torch from {index_url}") + try: + result = subprocess.run(cmd_swap, capture_output=True, text=True, timeout=900) + output = (result.stdout + "\n" + result.stderr).strip() + attempt_ok = result.returncode == 0 + except subprocess.TimeoutExpired: + output = f"Install from {index_url} timed out after 15 minutes." + attempt_ok = False + except OSError as exc: + output = f"{index_url}: {exc}" + attempt_ok = False + + if attempt_ok: + cmd_deps = [ + python, "-m", "pip", "install", + "--index-url", index_url, + "torch>=2.4.0", + ] + try: + dep_result = subprocess.run(cmd_deps, capture_output=True, text=True, timeout=900) + dep_output = (dep_result.stdout + "\n" + dep_result.stderr).strip() + output = f"{output}\n\n--- deps pass ---\n{dep_output}" if dep_output else output + except (subprocess.TimeoutExpired, OSError): + # Best-effort: torch itself swapped successfully, a missing + # transitive dep will surface at runtime via an ImportError + # the user can resolve from the Setup page. + pass + + attempts.append({"indexUrl": index_url, "ok": attempt_ok, "output": output}) + if attempt_ok: + ok = True + winning_output = output + winning_index = index_url + break + + # Re-probe so the UI can refresh its capabilities view. Note: torch + # already imported in this process is still the old module — the + # live cuda check won't flip to True without a restart. + state.runtime.refresh_capabilities(force=True) + caps = state.runtime.capabilities.to_dict() + no_wheel_for_python = (not ok) and _all_attempts_lack_wheel(attempts) + state.add_log( + "server", "info" if ok else "error", + f"CUDA torch install: {'succeeded via ' + winning_index if ok else 'failed after all candidates'}" + + (f" (no wheel for Python {python_version})" if no_wheel_for_python and python_version else ""), + ) + return { + "ok": ok, + "output": winning_output or (attempts[-1]["output"] if attempts else ""), + "indexUrl": winning_index, + "attempts": attempts, + "requiresRestart": ok, + "pythonExecutable": python, + "pythonVersion": python_version, + "noWheelForPython": no_wheel_for_python, + "capabilities": caps, + } + + # ------------------------------------------------------------------ # llama-server-turbo update check # ------------------------------------------------------------------ diff --git a/backend_service/routes/video.py b/backend_service/routes/video.py new file mode 100644 index 0000000..ada6da2 --- /dev/null +++ b/backend_service/routes/video.py @@ -0,0 +1,303 @@ +"""Video generation API routes. + +Backed by ``backend_service.video_runtime.VideoRuntimeManager``. This module +exposes the full preload / unload / download / generate / outputs lifecycle. 
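+
+A typical round-trip, sketched (bodies abridged; the generate payload shape
+is ``VideoGenerationRequest`` in ``backend_service.models``):
+
+    POST /api/video/preload   {"modelId": "..."}
+    POST /api/video/generate  {"modelId": "...", "prompt": "...", "numFrames": 33}
+    GET  /api/video/progress  -> {"progress": {"phase": "diffusing", ...}}
+    GET  /api/video/outputs   -> {"outputs": [...]}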
+""" + +from __future__ import annotations + +from pathlib import Path +from typing import Any + +from fastapi import APIRouter, HTTPException, Request +from fastapi.responses import FileResponse + +from backend_service.helpers.video import ( + _find_video_variant, + _find_video_variant_by_repo, + _is_video_repo, + _video_download_repo_ids, + _video_download_validation_error, + _video_model_payloads, + _video_variant_available_locally, +) +from backend_service.models import ( + DownloadModelRequest, + VideoGenerationRequest, + VideoRuntimePreloadRequest, + VideoRuntimeUnloadRequest, +) +from backend_service.progress import VIDEO_PROGRESS + + +router = APIRouter() + + +@router.get("/api/video/catalog") +def video_catalog(request: Request) -> dict[str, Any]: + """Return the curated catalog of video generation models.""" + library = request.app.state.chaosengine._library() + return { + "families": _video_model_payloads(library), + "latest": [], + } + + +@router.get("/api/video/runtime") +def video_runtime_status(request: Request) -> dict[str, Any]: + """Report the live video runtime capability from diffusers + torch.""" + state = request.app.state.chaosengine + return {"runtime": state.video_runtime.capabilities()} + + +@router.get("/api/video/progress") +def video_generation_progress() -> dict[str, Any]: + """Live progress snapshot for the in-flight video generation. + + Same shape as ``/api/images/progress`` so the frontend can reuse the same + client code. Returns ``active=false`` when nothing is running so the UI + falls back to its estimate-driven view. + """ + return {"progress": VIDEO_PROGRESS.snapshot()} + + +@router.post("/api/video/preload") +def preload_video_model(request: Request, body: VideoRuntimePreloadRequest) -> dict[str, Any]: + state = request.app.state.chaosengine + state.add_log("video", "info", f"Preload requested: modelId='{body.modelId}'") + variant = _find_video_variant(body.modelId) + if variant is None: + state.add_log("video", "error", f"Preload failed: model '{body.modelId}' not found") + raise HTTPException(status_code=404, detail=f"Unknown video model '{body.modelId}'.") + + if not _video_variant_available_locally(variant): + validation_error = _video_download_validation_error(variant["repo"]) + detail = validation_error or f"{variant['name']} is not installed locally yet." 
+ raise HTTPException(status_code=409, detail=detail) + + try: + runtime = state.video_runtime.preload(variant["repo"]) + except RuntimeError as exc: + state.add_log("video", "error", f"Failed to preload {variant['name']}: {exc}") + raise HTTPException(status_code=400, detail=f"Failed to load {variant['name']}: {exc}") from exc + except Exception as exc: + state.add_log( + "video", + "error", + f"Unexpected error preloading {variant['name']}: {type(exc).__name__}: {exc}", + ) + raise HTTPException( + status_code=500, + detail=f"Failed to load {variant['name']}: {type(exc).__name__}: {exc}", + ) from exc + + state.add_log("video", "info", f"Preloaded video model {variant['name']}.") + state.add_activity("Video model loaded", variant["name"]) + return {"runtime": runtime} + + +@router.post("/api/video/unload") +def unload_video_model(request: Request, body: VideoRuntimeUnloadRequest | None = None) -> dict[str, Any]: + state = request.app.state.chaosengine + requested_repo: str | None = None + requested_name: str | None = None + if body and body.modelId: + variant = _find_video_variant(body.modelId) + if variant is None: + raise HTTPException(status_code=404, detail=f"Unknown video model '{body.modelId}'.") + requested_repo = variant["repo"] + requested_name = variant["name"] + + current_runtime = state.video_runtime.capabilities() + current_repo = str(current_runtime.get("loadedModelRepo") or "") or None + try: + runtime = state.video_runtime.unload(requested_repo) + except RuntimeError as exc: + raise HTTPException(status_code=400, detail=str(exc)) from exc + + unloaded_repo = requested_repo or current_repo + if unloaded_repo and (requested_repo is None or requested_repo == current_repo): + unloaded_variant = _find_video_variant_by_repo(unloaded_repo) + unloaded_name = ( + unloaded_variant["name"] + if unloaded_variant + else requested_name or unloaded_repo + ) + state.add_log("video", "info", f"Unloaded video model {unloaded_name}.") + state.add_activity("Video model unloaded", unloaded_name) + return {"runtime": runtime} + + +@router.get("/api/video/library") +def video_library(request: Request) -> dict[str, Any]: + """Return the list of locally-installed video models.""" + state = request.app.state.chaosengine + library = state._library() + installed_models: list[dict[str, Any]] = [] + for family in _video_model_payloads(library): + for variant in family["variants"]: + if variant.get("availableLocally"): + installed_models.append(variant) + return {"models": installed_models} + + +@router.get("/api/video/outputs") +def video_outputs() -> dict[str, Any]: + """Return saved video outputs, newest first.""" + from backend_service.app import _load_video_outputs + return {"outputs": _load_video_outputs()} + + +@router.get("/api/video/outputs/{artifact_id}") +def video_output_detail(artifact_id: str) -> dict[str, Any]: + from backend_service.app import _find_video_output + output = _find_video_output(artifact_id) + if output is None: + raise HTTPException(status_code=404, detail=f"Video output '{artifact_id}' not found.") + return {"artifact": output} + + +@router.get("/api/video/outputs/{artifact_id}/file") +def video_output_file(artifact_id: str) -> FileResponse: + """Stream the mp4 for a saved video output. + + The frontend wires this up as the ``src`` of an HTML5