Files
Void-Homelab/workers/void_workers/model.py
root 3c028fed5a fix(workers): graceful GPU→CPU fallback for Whisper at load time
cuda_available() only covers "no GPU present". On a shared card the GPU
can exist but fail to load the model (VRAM exhausted by another process
e.g. Ollama). Try CUDA first, fall back to a CPU model on any load
error instead of crashing the transcription job. Supports HA portability
(node without GPU) and a contended GPU. Adds GPU-path + fallback tests.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-05 08:04:14 +10:00

48 lines
1.6 KiB
Python

import os
from .log import log
# Module-level cache for the loaded Whisper model. Stays None until
# whisper_model() loads a model on first use and stores it here.
_whisper_model = None
def cuda_available():
    """Report whether ctranslate2 sees at least one CUDA device.

    Returns:
        True when ``ctranslate2.get_cuda_device_count()`` is positive.
        False when the count is zero, or when importing or probing
        ctranslate2 raises (the failure is logged at info level).
    """
    try:
        # Imported inside the try so that a missing or broken ctranslate2
        # install is reported as "no CUDA" rather than raising to the caller.
        import ctranslate2

        has_cuda = ctranslate2.get_cuda_device_count() > 0
    except Exception as probe_error:
        log.info("ctranslate2_cuda_probe_failed", err=str(probe_error))
        has_cuda = False
    return has_cuda
def _load_whisper(device, compute_type):
    """Construct a faster-whisper ``WhisperModel`` for the requested device.

    The model name is read from the ``WHISPER_MODEL`` environment variable
    (default ``small.en``) and the download directory from ``WHISPER_CACHE``
    (default ``/var/lib/void/whisper-models``).

    Args:
        device: Device string forwarded to ``WhisperModel`` as ``device``.
        compute_type: Value forwarded to ``WhisperModel`` as ``compute_type``.

    Returns:
        The newly constructed ``WhisperModel`` instance. Any exception raised
        by the import or the constructor propagates to the caller.
    """
    # Deferred import: faster_whisper is only needed once a model is loaded.
    from faster_whisper import WhisperModel

    environ = os.environ
    model_name = environ.get("WHISPER_MODEL", "small.en")
    download_dir = environ.get("WHISPER_CACHE", "/var/lib/void/whisper-models")

    log.info(
        "whisper_loading",
        model=model_name,
        device=device,
        compute_type=compute_type,
        cache=download_dir,
    )

    model = WhisperModel(
        model_name,
        device=device,
        compute_type=compute_type,
        download_root=download_dir,
    )
    return model
def whisper_model():
    """Return the cached Whisper model, loading it on first use.

    On the first call the model is loaded on CUDA with ``float16`` when
    ``cuda_available()`` reports a device. If that load raises, the error is
    logged as a warning and a CPU ``int8`` model is loaded instead. When no
    CUDA device is reported, the CPU model is loaded directly. The result is
    stored in the module-level ``_whisper_model`` and reused by later calls.

    Returns:
        The loaded model object produced by ``_load_whisper``.

    Raises:
        Whatever ``_load_whisper("cpu", "int8")`` raises; the CPU load is the
        last resort and is not guarded.
    """
    global _whisper_model

    # Fast path: a model was already loaded by an earlier call.
    if _whisper_model is not None:
        return _whisper_model

    # The GPU may be absent OR present but unusable at load time (e.g. VRAM
    # already exhausted by another process sharing the card), so a failed
    # CUDA load must degrade to CPU rather than hard-fail a transcription.
    loaded = None
    if cuda_available():
        try:
            loaded = _load_whisper("cuda", "float16")
        except Exception as cuda_error:
            log.warning(
                "whisper_cuda_load_failed_fallback_cpu", err=str(cuda_error)
            )

    if loaded is None:
        loaded = _load_whisper("cpu", "int8")

    _whisper_model = loaded
    return loaded
def whisper_transcribe(audio_path):
    """Transcribe an audio file and return its text, one segment per line.

    Args:
        audio_path: Audio input forwarded unchanged to the model's
            ``transcribe`` method.

    Returns:
        The stripped text of every segment joined with newlines, with
        leading and trailing whitespace removed from the joined result.
    """
    model = whisper_model()
    # The second element of the returned pair is not used here.
    segments, _info = model.transcribe(audio_path, vad_filter=True)
    lines = [segment.text.strip() for segment in segments]
    transcript = "\n".join(lines)
    return transcript.strip()