fix(workers): graceful GPU→CPU fallback for Whisper at load time

cuda_available() only detects the "no GPU present" case. On a shared card the GPU can be present yet still fail to load the model (e.g. VRAM exhausted by another process such as Ollama). Try CUDA first and, on any load error, fall back to a CPU model instead of crashing the transcription job. This supports both HA portability (a node without a GPU) and a contended GPU. Adds tests for the GPU path and the CPU fallback.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-05 08:04:14 +10:00
parent 147b4f514c
commit 3c028fed5a
2 changed files with 49 additions and 10 deletions
--- a/workers/void_workers/model.py
+++ b/workers/void_workers/model.py
@@ -13,19 +13,32 @@ def cuda_available():
        return False


+def _load_whisper(device, compute_type):
+    from faster_whisper import WhisperModel
+    name = os.environ.get("WHISPER_MODEL", "small.en")
+    cache = os.environ.get("WHISPER_CACHE", "/var/lib/void/whisper-models")
+    log.info("whisper_loading", model=name, device=device,
+             compute_type=compute_type, cache=cache)
+    return WhisperModel(
+        name, device=device, compute_type=compute_type, download_root=cache
+    )
+
+
 def whisper_model():
    global _whisper_model
    if _whisper_model is None:
-        from faster_whisper import WhisperModel
-        name = os.environ.get("WHISPER_MODEL", "small.en")
-        cache = os.environ.get("WHISPER_CACHE", "/var/lib/void/whisper-models")
-        device = "cuda" if cuda_available() else "cpu"
-        compute_type = "float16" if device == "cuda" else "int8"
-        log.info("whisper_loading", model=name, device=device,
-                 compute_type=compute_type, cache=cache)
-        _whisper_model = WhisperModel(
-            name, device=device, compute_type=compute_type, download_root=cache
-        )
+        # Prefer the GPU when present, but fall back to CPU if the GPU is
+        # absent OR unusable at load time (e.g. VRAM already exhausted by
+        # another process sharing the card). HA portability + a shared GPU
+        # mean this must degrade gracefully, never hard-fail a transcription.
+        if cuda_available():
+            try:
+                _whisper_model = _load_whisper("cuda", "float16")
+            except Exception as e:
+                log.warning("whisper_cuda_load_failed_fallback_cpu", err=str(e))
+                _whisper_model = None
+        if _whisper_model is None:
+            _whisper_model = _load_whisper("cpu", "int8")
    return _whisper_model