"""Lazy Whisper model loading (GPU first, CPU fallback) and transcription."""

import os

from .log import log

# Process-wide cache: populated by the first successful whisper_model() call.
_whisper_model = None


def cuda_available():
    """Return True when ctranslate2 reports at least one CUDA device.

    Any exception raised while importing or querying ctranslate2 is logged
    at info level and reported as "no CUDA" instead of propagating.
    """
    try:
        import ctranslate2

        found = ctranslate2.get_cuda_device_count() > 0
    except Exception as exc:
        log.info("ctranslate2_cuda_probe_failed", err=str(exc))
        found = False
    return found


def _load_whisper(device, compute_type):
    """Build a ``WhisperModel`` for the given device and compute type.

    The model name is read from ``WHISPER_MODEL`` (default ``small.en``) and
    the download directory from ``WHISPER_CACHE`` (default
    ``/var/lib/void/whisper-models``). Exceptions from the import or from the
    constructor propagate to the caller.
    """
    # Imported here rather than at module top, so importing this module does
    # not itself require faster_whisper.
    from faster_whisper import WhisperModel

    env = os.environ
    model_name = env.get("WHISPER_MODEL", "small.en")
    download_root = env.get("WHISPER_CACHE", "/var/lib/void/whisper-models")
    log.info(
        "whisper_loading",
        model=model_name,
        device=device,
        compute_type=compute_type,
        cache=download_root,
    )
    return WhisperModel(
        model_name,
        device=device,
        compute_type=compute_type,
        download_root=download_root,
    )


def whisper_model():
    """Return the shared Whisper model, creating it on first use.

    The GPU is tried first when one is detected; if it is missing, or the
    CUDA load fails for any reason (e.g. VRAM already taken by another
    process on a shared card), a CPU int8 model is loaded instead. Only a
    failure of the CPU load itself propagates to the caller.
    """
    global _whisper_model

    if _whisper_model is not None:
        return _whisper_model

    model = None
    if cuda_available():
        # Try to make room on the shared GPU before loading. This is
        # best-effort: any failure is logged and otherwise ignored.
        try:
            from . import gpu

            gpu.free_ollama_vram()
        except Exception as exc:
            log.info("ollama_free_skipped", err=str(exc))

        # A failed CUDA load must degrade to CPU, not abort the caller.
        try:
            model = _load_whisper("cuda", "float16")
        except Exception as exc:
            log.warning("whisper_cuda_load_failed_fallback_cpu", err=str(exc))

    if model is None:
        model = _load_whisper("cpu", "int8")

    _whisper_model = model
    return model


def whisper_transcribe(audio_path):
    """Transcribe ``audio_path`` and return the text, one segment per line.

    Transcription runs with ``vad_filter=True``. Each segment's text is
    stripped, the pieces are joined with newlines, and the joined result is
    stripped once more.
    """
    result = whisper_model().transcribe(audio_path, vad_filter=True)
    segments, _info = result
    lines = [segment.text.strip() for segment in segments]
    return "\n".join(lines).strip()