Files
Void-Homelab/workers/tests/test_model.py
root a9191cee00 feat(workers): free Ollama VRAM before loading Whisper on the GPU
Whisper (CT 311) and Ollama (CT 102) share one A2000. Before loading
Whisper on CUDA, ask Ollama to unload its models (GET /api/ps then POST
/api/generate keep_alive:0) and wait for the card to clear, so the GPU
load has headroom. Best-effort and stdlib-only; Ollama reloads
cooperatively, and the existing CUDA->CPU fallback covers any failure.
Toggle via OLLAMA_FREE_BEFORE_STT; endpoint via OLLAMA_URL.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-05 21:12:05 +10:00

65 lines
2.7 KiB
Python

from unittest.mock import patch, MagicMock
from void_workers import model
def test_model_returns_singleton(monkeypatch):
    """Repeated whisper_model() calls must return the very same object."""
    # Clear the module-level `_whisper_model` (presumably the cache slot) so
    # the first call below has to construct a model.
    monkeypatch.setattr(model, "_whisper_model", None)
    with patch("void_workers.model.cuda_available", return_value=False):
        # Hand out a *distinct* mock for every constructor call. With a fixed
        # return_value the identity check would also pass if the model were
        # rebuilt on each call, so the test would not prove any caching.
        with patch(
            "faster_whisper.WhisperModel",
            side_effect=lambda *args, **kwargs: MagicMock(),
        ):
            a = model.whisper_model()
            b = model.whisper_model()
    assert a is b
def test_uses_gpu_when_available(monkeypatch):
    """With CUDA reported available, the model is built for cuda/float16."""
    monkeypatch.setattr(model, "_whisper_model", None)

    cuda_on = patch("void_workers.model.cuda_available", return_value=True)
    free_stub = patch("void_workers.gpu.free_ollama_vram", return_value=[])
    ctor_patch = patch("faster_whisper.WhisperModel", return_value=MagicMock())

    # Patches are entered left to right, same order as nested `with` blocks.
    with cuda_on, free_stub, ctor_patch as whisper_ctor:
        model.whisper_model()

    # Inspect the keyword arguments of the most recent constructor call.
    assert whisper_ctor.call_args.kwargs["device"] == "cuda"
    assert whisper_ctor.call_args.kwargs["compute_type"] == "float16"
def test_frees_ollama_before_gpu_load(monkeypatch):
    """Ollama VRAM must be freed BEFORE the cuda model is constructed."""
    monkeypatch.setattr(model, "_whisper_model", None)
    events = []

    def record_free(*args, **kwargs):
        # Stand-in for free_ollama_vram: only notes that it was invoked.
        events.append("free")

    def record_load(*args, **kwargs):
        # Stand-in for the WhisperModel constructor: note the call, then
        # hand back a dummy model object.
        events.append("load")
        return MagicMock()

    cuda_on = patch("void_workers.model.cuda_available", return_value=True)
    free_patch = patch("void_workers.gpu.free_ollama_vram", side_effect=record_free)
    ctor_patch = patch("faster_whisper.WhisperModel", side_effect=record_load)

    with cuda_on, free_patch, ctor_patch:
        model.whisper_model()

    # Exactly one free followed by exactly one load, in that order.
    assert events == ["free", "load"]
def test_falls_back_to_cpu_when_cuda_load_fails(monkeypatch):
    """A failing CUDA load must yield a CPU model instead of raising.

    The GPU is reported present, but constructing the model on it fails
    (e.g. VRAM exhausted); the job should not crash because of that.
    """
    monkeypatch.setattr(model, "_whisper_model", None)
    fallback_model = MagicMock()

    def flaky_ctor(name, device, compute_type, download_root):
        # Any non-cuda device succeeds; the cuda attempt blows up.
        if device != "cuda":
            return fallback_model
        raise RuntimeError("CUDA failed to allocate memory")

    cuda_on = patch("void_workers.model.cuda_available", return_value=True)
    free_stub = patch("void_workers.gpu.free_ollama_vram", return_value=[])
    ctor_patch = patch("faster_whisper.WhisperModel", side_effect=flaky_ctor)

    with cuda_on, free_stub, ctor_patch:
        result = model.whisper_model()

    assert result is fallback_model
def test_transcribe_returns_joined_segments(monkeypatch):
    """whisper_transcribe() output contains the text of every segment."""
    segments = [MagicMock(text=raw) for raw in (" Hello world ", " second line")]

    # transcribe() is stubbed to return a (segments, info) pair.
    stub_model = MagicMock()
    stub_model.transcribe.return_value = (segments, MagicMock())
    monkeypatch.setattr(model, "_whisper_model", stub_model)

    transcript = model.whisper_transcribe("/tmp/whatever.opus")

    for expected in ("Hello world", "second line"):
        assert expected in transcript