Whisper (CT 311) and Ollama (CT 102) share one A2000. Before loading Whisper on CUDA, ask Ollama to unload its models (GET /api/ps then POST /api/generate keep_alive:0) and wait for the card to clear, so the GPU load has headroom. Best-effort and stdlib-only; Ollama reloads cooperatively, and the existing CUDA->CPU fallback covers any failure. Toggle via OLLAMA_FREE_BEFORE_STT; endpoint via OLLAMA_URL. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
48 lines
1.9 KiB
Python
48 lines
1.9 KiB
Python
from unittest.mock import patch, call
|
|
from void_workers import gpu, config
|
|
|
|
|
|
def test_free_unloads_each_loaded_model(monkeypatch):
    """Check that a model listed by /api/ps is reported and gets an unload POST."""
    monkeypatch.setattr(config, "OLLAMA_FREE_BEFORE_STT", True)
    calls = []

    def fake_http(method, url, body=None, timeout=5):
        # Record every request so the assertions below can inspect them.
        calls.append((method, url, body))
        if not url.endswith("/api/ps"):
            return {}
        # loaded first, then empty after the unloads (confirm-poll)
        ps_probes = sum(1 for entry in calls if entry[1].endswith("/api/ps"))
        if ps_probes == 1:
            return {"models": [{"name": "llama3.1:8b"}]}
        return {"models": []}

    with patch("void_workers.gpu._http", side_effect=fake_http):
        freed = gpu.free_ollama_vram(base="http://x:11434")

    assert freed == ["llama3.1:8b"]
    # an unload POST with keep_alive:0 was issued for the loaded model
    expected_unload = (
        "POST",
        "http://x:11434/api/generate",
        {"model": "llama3.1:8b", "keep_alive": 0},
    )
    assert expected_unload in calls
|
|
|
|
|
|
def test_free_is_noop_when_disabled(monkeypatch):
    """With OLLAMA_FREE_BEFORE_STT off, the result is empty and _http is untouched."""
    monkeypatch.setattr(config, "OLLAMA_FREE_BEFORE_STT", False)
    with patch("void_workers.gpu._http") as http_mock:
        freed = gpu.free_ollama_vram(base="http://x:11434")
        assert freed == []
        http_mock.assert_not_called()
|
|
|
|
|
|
def test_free_is_noop_when_nothing_loaded(monkeypatch):
    """With an empty model list from _http, nothing is freed and only GETs occur."""
    monkeypatch.setattr(config, "OLLAMA_FREE_BEFORE_STT", True)
    empty_listing = {"models": []}
    with patch("void_workers.gpu._http", return_value=empty_listing) as http_mock:
        freed = gpu.free_ollama_vram(base="http://x:11434")
        assert freed == []
        # only the /api/ps probe, no unload POST
        assert not any(
            request.args[0] != "GET" for request in http_mock.call_args_list
        )
|
|
|
|
|
|
def test_free_never_raises_when_ollama_unreachable(monkeypatch):
    """An OSError from _http must yield an empty result rather than propagate."""
    monkeypatch.setattr(config, "OLLAMA_FREE_BEFORE_STT", True)
    unreachable = OSError("connection refused")
    with patch("void_workers.gpu._http", side_effect=unreachable):
        # ps fails -> [] -> no unload -> returns [] without propagating
        freed = gpu.free_ollama_vram(base="http://x:11434")
        assert freed == []
|