fix(workers): graceful GPU→CPU fallback for Whisper at load time
cuda_available() only covers the "no GPU present" case. On a shared card the GPU can exist but still fail to load the model (for example, when VRAM is exhausted by another process such as Ollama). Try CUDA first, then fall back to a CPU model on any load error instead of crashing the transcription job. This supports both HA (high-availability) portability (a node without a GPU) and a contended GPU. Adds GPU-path + fallback tests. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
@@ -12,6 +12,32 @@ def test_model_returns_singleton(monkeypatch):
|
||||
assert a is b
|
||||
|
||||
|
||||
def test_uses_gpu_when_available(monkeypatch):
    """With CUDA reported available, the model is built for cuda/float16."""
    # Clear the cached singleton so whisper_model() has to construct a model.
    monkeypatch.setattr(model, "_whisper_model", None)

    cuda_on = patch("void_workers.model.cuda_available", return_value=True)
    ctor = patch("faster_whisper.WhisperModel", return_value=MagicMock())
    with cuda_on, ctor as WM:
        model.whisper_model()

    # call_args stays readable on the mock after the patches are undone.
    ctor_kwargs = WM.call_args.kwargs
    assert ctor_kwargs["device"] == "cuda"
    assert ctor_kwargs["compute_type"] == "float16"
|
||||
|
||||
|
||||
def test_falls_back_to_cpu_when_cuda_load_fails(monkeypatch):
    """A failed CUDA load must degrade to a CPU model instead of raising.

    Simulates a GPU that is present but unusable at load time (e.g. VRAM
    exhausted): the constructor raises for ``device="cuda"`` and returns a
    sentinel model for any other device.
    """
    # Clear the cached singleton so whisper_model() has to construct a model.
    monkeypatch.setattr(model, "_whisper_model", None)
    cpu_model = MagicMock()
    attempts = []

    def fake_ctor(name, device, compute_type, download_root):
        # Record every construction attempt so the order can be asserted.
        attempts.append((device, compute_type))
        if device == "cuda":
            raise RuntimeError("CUDA failed to allocate memory")
        return cpu_model

    with patch("void_workers.model.cuda_available", return_value=True):
        with patch("faster_whisper.WhisperModel", side_effect=fake_ctor):
            got = model.whisper_model()

    assert got is cpu_model
    # The GPU must be tried first and the CPU load must be the fallback;
    # checking only the returned object would also pass if CUDA were skipped.
    assert attempts == [("cuda", "float16"), ("cpu", "int8")]
|
||||
|
||||
|
||||
def test_transcribe_returns_joined_segments(monkeypatch):
|
||||
seg1 = MagicMock(text=" Hello world ")
|
||||
seg2 = MagicMock(text=" second line")
|
||||
|
||||
@@ -13,19 +13,32 @@ def cuda_available():
|
||||
return False
|
||||
|
||||
|
||||
def _load_whisper(device, compute_type):
    """Construct a faster-whisper model for ``device`` / ``compute_type``.

    The model name comes from the ``WHISPER_MODEL`` environment variable
    (default ``small.en``) and the download directory from ``WHISPER_CACHE``
    (default ``/var/lib/void/whisper-models``). Errors raised by the
    constructor are not handled here and propagate to the caller.
    """
    # Imported at call time, not at module import time.
    from faster_whisper import WhisperModel

    env = os.environ
    model_name = env.get("WHISPER_MODEL", "small.en")
    cache_dir = env.get("WHISPER_CACHE", "/var/lib/void/whisper-models")

    log.info(
        "whisper_loading",
        model=model_name,
        device=device,
        compute_type=compute_type,
        cache=cache_dir,
    )

    loaded = WhisperModel(
        model_name,
        device=device,
        compute_type=compute_type,
        download_root=cache_dir,
    )
    return loaded
|
||||
|
||||
|
||||
def whisper_model():
    """Return the shared Whisper model, loading it on first use.

    The loaded model is cached in the module-level ``_whisper_model`` so
    later calls return the same object. When ``cuda_available()`` is true a
    CUDA/float16 model is tried first; if that load raises, the error is
    logged as a warning and a CPU/int8 model is loaded instead. An error
    from the CPU load itself is not caught and propagates to the caller.
    """
    global _whisper_model
    if _whisper_model is not None:
        return _whisper_model

    # Prefer the GPU when present, but fall back to CPU if the GPU is
    # absent OR unusable at load time (e.g. VRAM already exhausted by
    # another process sharing the card). HA portability + a shared GPU
    # mean this must degrade gracefully, never hard-fail a transcription.
    if cuda_available():
        try:
            _whisper_model = _load_whisper("cuda", "float16")
        except Exception as e:
            # Deliberately broad: any load failure should trigger the CPU
            # fallback. _whisper_model is still None here because the
            # assignment above never completed.
            log.warning("whisper_cuda_load_failed_fallback_cpu", err=str(e))

    if _whisper_model is None:
        _whisper_model = _load_whisper("cpu", "int8")
    return _whisper_model
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user