# Patch summary (leading commentary; patch tools skip text before the first
# "diff --git" header).
#
# Files touched:
#   workers/void_workers/model.py
#     - Adds _load_whisper(device, compute_type): imports WhisperModel from
#       faster_whisper at call time, reads WHISPER_MODEL (default "small.en")
#       and WHISPER_CACHE (default "/var/lib/void/whisper-models") from the
#       environment, logs "whisper_loading", and returns
#       WhisperModel(name, device=..., compute_type=..., download_root=cache).
#     - Rewrites whisper_model(): when cuda_available() is true it first tries
#       _load_whisper("cuda", "float16"); any Exception from that load is
#       logged as "whisper_cuda_load_failed_fallback_cpu" and the function
#       then loads _load_whisper("cpu", "int8"). The result is stored in the
#       module global _whisper_model and returned on every later call.
#   workers/tests/test_model.py
#     - test_uses_gpu_when_available: with cuda_available patched to True,
#       asserts WhisperModel was called with device="cuda" and
#       compute_type="float16".
#     - test_falls_back_to_cpu_when_cuda_load_fails: the patched constructor
#       raises RuntimeError for device="cuda"; asserts whisper_model() returns
#       the CPU model instead of raising.
#
# NOTE(review): because the loaded model is cached in _whisper_model and only
# reloaded when that global is None, a process that fell back to CPU keeps the
# CPU model for subsequent calls; the GPU is not retried.
# NOTE(review): only the load call is wrapped in try/except. Errors raised
# after loading (e.g. during transcription) are outside this handler --
# confirm whether they need similar fallback handling.
# NOTE(review): the hunks below appear to have had their newlines and
# indentation collapsed; restore the original line structure before applying.
diff --git a/workers/tests/test_model.py b/workers/tests/test_model.py index 250a4ae..a683258 100644 --- a/workers/tests/test_model.py +++ b/workers/tests/test_model.py @@ -12,6 +12,32 @@ def test_model_returns_singleton(monkeypatch): assert a is b +def test_uses_gpu_when_available(monkeypatch): + monkeypatch.setattr(model, "_whisper_model", None) + with patch("void_workers.model.cuda_available", return_value=True): + with patch("faster_whisper.WhisperModel", return_value=MagicMock()) as WM: + model.whisper_model() + assert WM.call_args.kwargs["device"] == "cuda" + assert WM.call_args.kwargs["compute_type"] == "float16" + + +def test_falls_back_to_cpu_when_cuda_load_fails(monkeypatch): + # GPU is present but the model fails to load (e.g. VRAM exhausted): must + # not raise — fall back to a CPU model instead of crashing the job. + monkeypatch.setattr(model, "_whisper_model", None) + cpu_model = MagicMock() + + def fake_ctor(name, device, compute_type, download_root): + if device == "cuda": + raise RuntimeError("CUDA failed to allocate memory") + return cpu_model + + with patch("void_workers.model.cuda_available", return_value=True): + with patch("faster_whisper.WhisperModel", side_effect=fake_ctor): + got = model.whisper_model() + assert got is cpu_model + + def test_transcribe_returns_joined_segments(monkeypatch): seg1 = MagicMock(text=" Hello world ") seg2 = MagicMock(text=" second line") diff --git a/workers/void_workers/model.py b/workers/void_workers/model.py index 0212671..727bf60 100644 --- a/workers/void_workers/model.py +++ b/workers/void_workers/model.py @@ -13,19 +13,32 @@ def cuda_available(): return False +def _load_whisper(device, compute_type): + from faster_whisper import WhisperModel + name = os.environ.get("WHISPER_MODEL", "small.en") + cache = os.environ.get("WHISPER_CACHE", "/var/lib/void/whisper-models") + log.info("whisper_loading", model=name, device=device, + compute_type=compute_type, cache=cache) + return WhisperModel( + name, 
device=device, compute_type=compute_type, download_root=cache + ) + + def whisper_model(): global _whisper_model if _whisper_model is None: - from faster_whisper import WhisperModel - name = os.environ.get("WHISPER_MODEL", "small.en") - cache = os.environ.get("WHISPER_CACHE", "/var/lib/void/whisper-models") - device = "cuda" if cuda_available() else "cpu" - compute_type = "float16" if device == "cuda" else "int8" - log.info("whisper_loading", model=name, device=device, - compute_type=compute_type, cache=cache) - _whisper_model = WhisperModel( - name, device=device, compute_type=compute_type, download_root=cache - ) + # Prefer the GPU when present, but fall back to CPU if the GPU is + # absent OR unusable at load time (e.g. VRAM already exhausted by + # another process sharing the card). HA portability + a shared GPU + # mean this must degrade gracefully, never hard-fail a transcription. + if cuda_available(): + try: + _whisper_model = _load_whisper("cuda", "float16") + except Exception as e: + log.warning("whisper_cuda_load_failed_fallback_cpu", err=str(e)) + _whisper_model = None + if _whisper_model is None: + _whisper_model = _load_whisper("cpu", "int8") return _whisper_model