Whisper (CT 311) and Ollama (CT 102) share one A2000. Before loading Whisper on CUDA, ask Ollama to unload its models (GET /api/ps then POST /api/generate keep_alive:0) and wait for the card to clear, so the GPU load has headroom. Best-effort and stdlib-only; Ollama reloads cooperatively, and the existing CUDA->CPU fallback covers any failure. Toggle via OLLAMA_FREE_BEFORE_STT; endpoint via OLLAMA_URL. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
54 lines
1.9 KiB
Python
54 lines
1.9 KiB
Python
import os
|
|
from .log import log
|
|
|
|
# Module-level cache for the loaded Whisper model. Starts empty and is filled
# on the first call to whisper_model(), which then reuses the same instance.
_whisper_model = None
|
|
|
|
|
|
def cuda_available():
    """Return True when CTranslate2 reports at least one CUDA device.

    The probe is deliberately forgiving: if ``ctranslate2`` cannot be imported
    or the device query raises for any reason, the failure is logged at info
    level and the function answers False instead of propagating the error.
    """
    try:
        # Imported lazily so a missing/broken ctranslate2 is handled here
        # rather than at module import time.
        import ctranslate2

        has_gpu = ctranslate2.get_cuda_device_count() > 0
    except Exception as exc:
        log.info("ctranslate2_cuda_probe_failed", err=str(exc))
        return False
    else:
        return has_gpu
|
|
|
|
|
|
def _load_whisper(device, compute_type):
    """Construct a faster-whisper ``WhisperModel`` for the requested target.

    Args:
        device: Device string handed straight to ``WhisperModel``
            (callers in this module pass "cuda" or "cpu").
        compute_type: Compute type handed straight to ``WhisperModel``
            (callers in this module pass "float16" or "int8").

    Returns:
        The newly constructed ``WhisperModel``.

    The model name comes from the ``WHISPER_MODEL`` environment variable
    (default "small.en") and the download directory from ``WHISPER_CACHE``
    (default "/var/lib/void/whisper-models"). Any exception raised by the
    import or the constructor propagates to the caller.
    """
    # Lazy import: keeps faster_whisper off the module import path.
    from faster_whisper import WhisperModel

    env = os.environ
    model_name = env.get("WHISPER_MODEL", "small.en")
    download_dir = env.get("WHISPER_CACHE", "/var/lib/void/whisper-models")

    log.info(
        "whisper_loading",
        model=model_name,
        device=device,
        compute_type=compute_type,
        cache=download_dir,
    )

    model = WhisperModel(
        model_name,
        device=device,
        compute_type=compute_type,
        download_root=download_dir,
    )
    return model
|
|
|
|
|
|
def whisper_model():
    """Return the shared Whisper model, loading it on first use.

    Loading strategy:
      1. If a model is already cached in ``_whisper_model``, return it.
      2. If ``cuda_available()`` is true, first ask the ``gpu`` helper to free
         Ollama's VRAM (any error there is logged and ignored), then try a
         CUDA / float16 load.
      3. If CUDA is absent, or the CUDA load raised, load on CPU with int8.

    A failed CUDA load is logged as a warning and never surfaces to the
    caller; only a failure of the CPU load propagates, in which case the
    cache is left empty so a later call retries from scratch.
    """
    global _whisper_model

    # Fast path: already loaded.
    if _whisper_model is not None:
        return _whisper_model

    loaded = None

    # The GPU is preferred but may be missing or unusable at load time (for
    # example when another process sharing the card has exhausted VRAM), so
    # every GPU-side step below is allowed to fail without aborting.
    if cuda_available():
        # Best-effort attempt to make room on the shared GPU before loading.
        try:
            from . import gpu

            gpu.free_ollama_vram()
        except Exception as exc:
            log.info("ollama_free_skipped", err=str(exc))

        try:
            loaded = _load_whisper("cuda", "float16")
        except Exception as exc:
            log.warning("whisper_cuda_load_failed_fallback_cpu", err=str(exc))
            loaded = None

    # No GPU, or the GPU load did not produce a model: degrade to CPU.
    if loaded is None:
        loaded = _load_whisper("cpu", "int8")

    _whisper_model = loaded
    return _whisper_model
|
|
|
|
|
|
def whisper_transcribe(audio_path):
    """Transcribe an audio file and return the text as a single string.

    Args:
        audio_path: Passed unchanged to the model's ``transcribe`` call.

    Returns:
        The stripped text of every segment, joined with newlines, with
        leading/trailing whitespace removed from the final result.

    The model is obtained from ``whisper_model()`` and transcription runs
    with ``vad_filter=True``.
    """
    model = whisper_model()
    # The second element of the result is not used here.
    segments, _ = model.transcribe(audio_path, vad_filter=True)
    lines = [segment.text.strip() for segment in segments]
    return "\n".join(lines).strip()
|