Whisper (CT 311) and Ollama (CT 102) share one A2000. Before loading Whisper on CUDA, ask Ollama to unload its models (GET /api/ps then POST /api/generate keep_alive:0) and wait for the card to clear, so the GPU load has headroom. Best-effort and stdlib-only; Ollama reloads cooperatively, and the existing CUDA->CPU fallback covers any failure. Toggle via OLLAMA_FREE_BEFORE_STT; endpoint via OLLAMA_URL. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
65 lines
2.7 KiB
Python
65 lines
2.7 KiB
Python
from unittest.mock import patch, MagicMock
|
|
from void_workers import model
|
|
|
|
|
|
def test_model_returns_singleton(monkeypatch):
    # Repeated calls to whisper_model() must hand back the very same object.
    #
    # The constructor is patched with a side_effect that builds a *distinct*
    # mock on every invocation. With a fixed return_value the identity check
    # below would pass even if the loader rebuilt the model on each call, so
    # the test could never catch a broken cache.
    monkeypatch.setattr(model, "_whisper_model", None)  # start uncached
    with patch("void_workers.model.cuda_available", return_value=False):
        with patch("faster_whisper.WhisperModel",
                   side_effect=lambda *a, **k: MagicMock()):
            a = model.whisper_model()
            b = model.whisper_model()
    assert a is b
|
|
|
|
|
|
def test_uses_gpu_when_available(monkeypatch):
    # When CUDA is reported as available, the model constructor must be
    # asked for the "cuda" device with the "float16" compute type.
    monkeypatch.setattr(model, "_whisper_model", None)  # start uncached

    # Build the patchers up front; they only take effect inside the `with`.
    cuda_on = patch("void_workers.model.cuda_available", return_value=True)
    ollama_noop = patch("void_workers.gpu.free_ollama_vram", return_value=[])
    ctor = patch("faster_whisper.WhisperModel", return_value=MagicMock())

    with cuda_on, ollama_noop, ctor as WM:
        model.whisper_model()

    ctor_kwargs = WM.call_args.kwargs
    assert ctor_kwargs["device"] == "cuda"
    assert ctor_kwargs["compute_type"] == "float16"
|
|
|
|
|
|
def test_frees_ollama_before_gpu_load(monkeypatch):
    # The Ollama VRAM release has to run ahead of the CUDA model constructor;
    # both patched callables log their invocation so the order can be checked.
    monkeypatch.setattr(model, "_whisper_model", None)  # start uncached
    events = []

    def record_free(*args, **kwargs):
        # Stand-in for free_ollama_vram: just note that it ran.
        events.append("free")

    def record_load(*args, **kwargs):
        # Stand-in for the WhisperModel constructor: note the call and
        # return a dummy model object.
        events.append("load")
        return MagicMock()

    with patch("void_workers.model.cuda_available", return_value=True), \
            patch("void_workers.gpu.free_ollama_vram",
                  side_effect=record_free), \
            patch("faster_whisper.WhisperModel", side_effect=record_load):
        model.whisper_model()

    assert events == ["free", "load"]
|
|
|
|
|
|
def test_falls_back_to_cpu_when_cuda_load_fails(monkeypatch):
    # A GPU is reported, but constructing the model on it blows up (e.g. VRAM
    # exhausted). The loader must swallow that and return a CPU model rather
    # than letting the error crash the job.
    monkeypatch.setattr(model, "_whisper_model", None)  # start uncached
    cpu_model = MagicMock()

    def fake_ctor(name, device, compute_type, download_root):
        # Any non-CUDA device succeeds; the CUDA attempt always fails.
        if device != "cuda":
            return cpu_model
        raise RuntimeError("CUDA failed to allocate memory")

    with patch("void_workers.model.cuda_available", return_value=True), \
            patch("void_workers.gpu.free_ollama_vram", return_value=[]), \
            patch("faster_whisper.WhisperModel", side_effect=fake_ctor):
        got = model.whisper_model()

    assert got is cpu_model
|
|
|
|
|
|
def test_transcribe_returns_joined_segments(monkeypatch):
    # whisper_transcribe() should surface the text of every segment that the
    # (pre-installed, fake) model's transcribe() call yields.
    segments = [
        MagicMock(text=raw)
        for raw in (" Hello world ", " second line")
    ]
    stub = MagicMock()
    # transcribe() is stubbed to return a (segments, info) pair.
    stub.transcribe.return_value = (segments, MagicMock())
    monkeypatch.setattr(model, "_whisper_model", stub)

    out = model.whisper_transcribe("/tmp/whatever.opus")

    for expected in ("Hello world", "second line"):
        assert expected in out
|