Files
Void-Homelab/workers/void_workers/model.py
root a9191cee00 feat(workers): free Ollama VRAM before loading Whisper on the GPU
Whisper (CT 311) and Ollama (CT 102) share one A2000. Before loading
Whisper on CUDA, ask Ollama to unload its models (GET /api/ps then POST
/api/generate keep_alive:0) and wait for the card to clear, so the GPU
load has headroom. Best-effort and stdlib-only; Ollama reloads
cooperatively, and the existing CUDA->CPU fallback covers any failure.
Toggle via OLLAMA_FREE_BEFORE_STT; endpoint via OLLAMA_URL.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-05 21:12:05 +10:00

54 lines
1.9 KiB
Python

import os
from .log import log
# Module-level cache for the loaded Whisper model: stays None until
# whisper_model() loads one, after which that same instance is reused.
_whisper_model = None
def cuda_available():
    """Report whether CTranslate2 sees at least one CUDA device.

    Returns:
        True when ``ctranslate2.get_cuda_device_count()`` is positive.
        False when the count is zero, or when importing or probing
        ctranslate2 raises; in that case the error is logged at info level
        as ``ctranslate2_cuda_probe_failed``.
    """
    try:
        # The import lives inside the try so a missing or broken ctranslate2
        # install is reported as "no CUDA" instead of raising to the caller.
        import ctranslate2 as ct2

        has_cuda = ct2.get_cuda_device_count() > 0
    except Exception as exc:
        log.info("ctranslate2_cuda_probe_failed", err=str(exc))
        return False
    return has_cuda
def _load_whisper(device, compute_type):
    """Construct a faster-whisper ``WhisperModel`` for the given device.

    The model name is read from ``WHISPER_MODEL`` (default ``small.en``) and
    the download directory from ``WHISPER_CACHE`` (default
    ``/var/lib/void/whisper-models``). The chosen settings are logged as
    ``whisper_loading`` before the model is constructed.

    Args:
        device: Device string forwarded to ``WhisperModel`` (this module
            passes ``"cuda"`` or ``"cpu"``).
        compute_type: Compute type forwarded to ``WhisperModel`` (this
            module passes ``"float16"`` or ``"int8"``).

    Returns:
        The constructed ``WhisperModel``. Exceptions raised by the import or
        by the constructor are not caught here.
    """
    from faster_whisper import WhisperModel

    env = os.environ
    model_name = env.get("WHISPER_MODEL", "small.en")
    cache_dir = env.get("WHISPER_CACHE", "/var/lib/void/whisper-models")

    log.info(
        "whisper_loading",
        model=model_name,
        device=device,
        compute_type=compute_type,
        cache=cache_dir,
    )

    load_options = {
        "device": device,
        "compute_type": compute_type,
        "download_root": cache_dir,
    }
    return WhisperModel(model_name, **load_options)
def whisper_model():
    """Return the cached Whisper model, loading it on the first call.

    Load order on first use:

    1. If ``cuda_available()`` is true, call ``gpu.free_ollama_vram()`` to
       make room on the shared GPU. Any exception from that step is logged
       as ``ollama_free_skipped`` and otherwise ignored.
    2. Still on the CUDA path, try a ``"cuda"`` / ``"float16"`` load. A
       failure is logged as ``whisper_cuda_load_failed_fallback_cpu``.
    3. If no model was obtained (no CUDA, or the CUDA load failed), load on
       ``"cpu"`` with ``"int8"``. An exception from this final load
       propagates to the caller.

    Returns:
        The model stored in the module-level ``_whisper_model`` cache.
    """
    global _whisper_model

    # Fast path: a previous call already loaded the model.
    if _whisper_model is not None:
        return _whisper_model

    model = None
    if cuda_available():
        # Freeing VRAM is best-effort; a failure here must not prevent the
        # load attempt that follows.
        try:
            from . import gpu

            gpu.free_ollama_vram()
        except Exception as exc:
            log.info("ollama_free_skipped", err=str(exc))

        # The GPU can be present yet unusable at load time (e.g. VRAM held
        # by another process), so a failed CUDA load degrades to CPU below.
        try:
            model = _load_whisper("cuda", "float16")
        except Exception as exc:
            log.warning("whisper_cuda_load_failed_fallback_cpu", err=str(exc))
            model = None

    if model is None:
        model = _load_whisper("cpu", "int8")

    _whisper_model = model
    return _whisper_model
def whisper_transcribe(audio_path):
    """Transcribe ``audio_path`` and return the text, one segment per line.

    Calls ``transcribe`` on the model from ``whisper_model()`` with
    ``vad_filter=True``. Each segment's text is stripped of surrounding
    whitespace, the segments are joined with newlines, and the joined
    string is stripped once more.

    Args:
        audio_path: Audio input forwarded unchanged to ``transcribe``.

    Returns:
        The transcript as a single string.
    """
    model = whisper_model()
    segments, _info = model.transcribe(audio_path, vad_filter=True)
    lines = [segment.text.strip() for segment in segments]
    return "\n".join(lines).strip()