fix(workers): graceful GPU→CPU fallback for Whisper at load time

cuda_available() only covers "no GPU present". On a shared card the GPU can exist but fail to load the model (VRAM exhausted by another process e.g. Ollama). Try CUDA first, fall back to a CPU model on any load error instead of crashing the transcription job. Supports HA portability (node without GPU) and a contended GPU. Adds GPU-path + fallback tests. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-05 08:04:14 +10:00
parent 147b4f514c
commit 3c028fed5a
2 changed files with 49 additions and 10 deletions
--- a/workers/tests/test_model.py
+++ b/workers/tests/test_model.py
@@ -12,6 +12,32 @@ def test_model_returns_singleton(monkeypatch):
            assert a is b


+def test_uses_gpu_when_available(monkeypatch):
+    monkeypatch.setattr(model, "_whisper_model", None)
+    with patch("void_workers.model.cuda_available", return_value=True):
+        with patch("faster_whisper.WhisperModel", return_value=MagicMock()) as WM:
+            model.whisper_model()
+            assert WM.call_args.kwargs["device"] == "cuda"
+            assert WM.call_args.kwargs["compute_type"] == "float16"
+
+
+def test_falls_back_to_cpu_when_cuda_load_fails(monkeypatch):
+    # GPU is present but the model fails to load (e.g. VRAM exhausted): must
+    # not raise — fall back to a CPU model instead of crashing the job.
+    monkeypatch.setattr(model, "_whisper_model", None)
+    cpu_model = MagicMock()
+
+    def fake_ctor(name, device, compute_type, download_root):
+        if device == "cuda":
+            raise RuntimeError("CUDA failed to allocate memory")
+        return cpu_model
+
+    with patch("void_workers.model.cuda_available", return_value=True):
+        with patch("faster_whisper.WhisperModel", side_effect=fake_ctor):
+            got = model.whisper_model()
+    assert got is cpu_model
+
+
 def test_transcribe_returns_joined_segments(monkeypatch):
    seg1 = MagicMock(text=" Hello world ")
    seg2 = MagicMock(text=" second line")