# Void-Homelab/workers/void_workers/model.py

import os
from .log import log

# Module-level cache for the loaded Whisper model. It stays None until the
# first whisper_model() call succeeds, after which the same instance is reused.
_whisper_model = None


def cuda_available():
    """Report whether ctranslate2 can see at least one CUDA device.

    The probe is best-effort: if ctranslate2 cannot be imported or the
    device query raises, the error is logged at info level and the result
    is ``False``.

    Returns:
        ``True`` when ``ctranslate2.get_cuda_device_count()`` is greater
        than zero, ``False`` otherwise (including on any probe failure).
    """
    try:
        # Imported lazily so a missing/broken ctranslate2 is handled here
        # rather than at module import time.
        import ctranslate2
        has_gpu = ctranslate2.get_cuda_device_count() > 0
    except Exception as exc:
        log.info("ctranslate2_cuda_probe_failed", err=str(exc))
        return False
    return has_gpu


def _load_whisper(device, compute_type):
    """Construct a faster-whisper ``WhisperModel`` for the given device.

    The model name is read from the ``WHISPER_MODEL`` environment variable
    (default ``"small.en"``) and the download/cache directory from
    ``WHISPER_CACHE`` (default ``"/var/lib/void/whisper-models"``).

    Args:
        device: Device string forwarded to ``WhisperModel`` (callers in this
            module pass ``"cuda"`` or ``"cpu"``).
        compute_type: Compute type forwarded to ``WhisperModel`` (callers in
            this module pass ``"float16"`` or ``"int8"``).

    Returns:
        The newly constructed ``WhisperModel`` instance.

    Raises:
        Whatever the ``faster_whisper`` import or the ``WhisperModel``
        constructor raises; nothing is caught here.
    """
    # Lazy import: the dependency is only needed once a model is requested.
    from faster_whisper import WhisperModel

    model_name = os.getenv("WHISPER_MODEL", "small.en")
    cache_dir = os.getenv("WHISPER_CACHE", "/var/lib/void/whisper-models")

    log.info(
        "whisper_loading",
        model=model_name,
        device=device,
        compute_type=compute_type,
        cache=cache_dir,
    )

    model = WhisperModel(
        model_name,
        device=device,
        compute_type=compute_type,
        download_root=cache_dir,
    )
    return model


def whisper_model():
    """Return the shared Whisper model, loading it on first use.

    The loaded instance is cached in the module-level ``_whisper_model`` so
    later calls return it directly.

    Loading strategy:
      1. If ``cuda_available()`` reports a GPU, first try (best-effort) to
         free VRAM via ``gpu.free_ollama_vram()``, then load on ``"cuda"``
         with ``"float16"``.
      2. If there is no GPU, or the CUDA load raises, load on ``"cpu"`` with
         ``"int8"`` instead.

    Returns:
        The cached (or freshly loaded) model object.

    Raises:
        Whatever the CPU load raises; only failures on the GPU path are
        caught and turned into a CPU fallback.
    """
    global _whisper_model

    # Fast path: already loaded by an earlier call.
    if _whisper_model is not None:
        return _whisper_model

    loaded = None

    # The GPU is preferred but may be missing or unusable at load time (for
    # example VRAM taken by another process on a shared card), so any CUDA
    # failure degrades to CPU instead of failing the transcription.
    if cuda_available():
        # Try to make room on the shared GPU. A failure here is only logged;
        # the CUDA load is still attempted afterwards.
        try:
            from . import gpu
            gpu.free_ollama_vram()
        except Exception as exc:
            log.info("ollama_free_skipped", err=str(exc))

        try:
            loaded = _load_whisper("cuda", "float16")
        except Exception as exc:
            log.warning("whisper_cuda_load_failed_fallback_cpu", err=str(exc))

    # No GPU, or the GPU load failed: fall back to the CPU configuration.
    if loaded is None:
        loaded = _load_whisper("cpu", "int8")

    _whisper_model = loaded
    return _whisper_model


def whisper_transcribe(audio_path):
    """Transcribe ``audio_path`` with the shared Whisper model.

    Calls ``transcribe`` on the model from ``whisper_model()`` with
    ``vad_filter=True``; the info object it returns alongside the segments
    is discarded.

    Args:
        audio_path: Passed through unchanged as the first argument to the
            model's ``transcribe`` method.

    Returns:
        A single string with each segment's whitespace-stripped text on its
        own line, and the combined result stripped of leading/trailing
        whitespace.
    """
    model = whisper_model()
    segments, _ = model.transcribe(audio_path, vad_filter=True)
    lines = [segment.text.strip() for segment in segments]
    return "\n".join(lines).strip()