Files
Void-Homelab/workers/tests/test_gpu.py
root a9191cee00 feat(workers): free Ollama VRAM before loading Whisper on the GPU
Whisper (CT 311) and Ollama (CT 102) share one A2000. Before loading
Whisper on CUDA, ask Ollama to unload its models (GET /api/ps then POST
/api/generate keep_alive:0) and wait for the card to clear, so the GPU
load has headroom. Best-effort and stdlib-only; Ollama reloads
cooperatively, and the existing CUDA->CPU fallback covers any failure.
Toggle via OLLAMA_FREE_BEFORE_STT; endpoint via OLLAMA_URL.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-05 21:12:05 +10:00

48 lines
1.9 KiB
Python

from unittest.mock import patch, call
from void_workers import gpu, config
def test_free_unloads_each_loaded_model(monkeypatch):
    """A model reported by /api/ps is unloaded and returned by name.

    The fake transport records every request. Its first ``/api/ps`` reply
    lists one loaded model; every later ``/api/ps`` reply is empty, which
    stands in for the card having cleared after the unload.
    """
    monkeypatch.setattr(config, "OLLAMA_FREE_BEFORE_STT", True)
    recorded = []

    def fake_http(method, url, body=None, timeout=5):
        recorded.append((method, url, body))
        if not url.endswith("/api/ps"):
            # Anything other than the probe (i.e. the unload POST) gets
            # an empty reply.
            return {}
        # Count the probes seen so far, including the one just recorded.
        probes_so_far = sum(
            1 for _, seen_url, _ in recorded if seen_url.endswith("/api/ps")
        )
        if probes_so_far == 1:
            return {"models": [{"name": "llama3.1:8b"}]}
        return {"models": []}

    with patch("void_workers.gpu._http", side_effect=fake_http):
        freed = gpu.free_ollama_vram(base="http://x:11434")

    assert freed == ["llama3.1:8b"]

    # The loaded model must have received an unload request (keep_alive: 0).
    expected_unload = (
        "POST",
        "http://x:11434/api/generate",
        {"model": "llama3.1:8b", "keep_alive": 0},
    )
    assert expected_unload in recorded
def test_free_is_noop_when_disabled(monkeypatch):
    """With OLLAMA_FREE_BEFORE_STT off, nothing is freed and no HTTP is sent."""
    monkeypatch.setattr(config, "OLLAMA_FREE_BEFORE_STT", False)

    with patch("void_workers.gpu._http") as http_mock:
        freed = gpu.free_ollama_vram(base="http://x:11434")

    assert freed == []
    # The toggle must short-circuit before any request is attempted.
    http_mock.assert_not_called()
def test_free_is_noop_when_nothing_loaded(monkeypatch):
    """An empty /api/ps listing yields [] and triggers no unload request."""
    monkeypatch.setattr(config, "OLLAMA_FREE_BEFORE_STT", True)
    nothing_loaded = {"models": []}

    with patch("void_workers.gpu._http", return_value=nothing_loaded) as http_mock:
        freed = gpu.free_ollama_vram(base="http://x:11434")

    assert freed == []
    # Every recorded request must be a GET: only the /api/ps probe is
    # allowed, never an unload POST.
    for request in http_mock.call_args_list:
        assert request.args[0] == "GET"
def test_free_never_raises_when_ollama_unreachable(monkeypatch):
    """A transport-level OSError is swallowed and reported as nothing freed."""
    monkeypatch.setattr(config, "OLLAMA_FREE_BEFORE_STT", True)
    unreachable = OSError("connection refused")

    # Every _http call raises, so the probe itself fails; the call must
    # still return [] rather than propagate the error.
    with patch("void_workers.gpu._http", side_effect=unreachable):
        freed = gpu.free_ollama_vram(base="http://x:11434")

    assert freed == []