"""Tests for ``gpu.free_ollama_vram`` with ``gpu._http`` patched out.

Every test replaces ``void_workers.gpu._http`` so no real Ollama server is
contacted; the ``base`` URL is a placeholder.
"""

from unittest.mock import patch, call

from void_workers import gpu, config


def test_free_unloads_each_loaded_model(monkeypatch):
    """A loaded model is reported as freed and gets a keep_alive=0 unload POST."""
    monkeypatch.setattr(config, "OLLAMA_FREE_BEFORE_STT", True)
    calls = []

    def fake_http(method, url, body=None, timeout=5):
        # Record every request so the assertions below can inspect them.
        calls.append((method, url, body))
        if not url.endswith("/api/ps"):
            return {}
        ps_probes = sum(
            1 for _, seen_url, _ in calls if seen_url.endswith("/api/ps")
        )
        # loaded first, then empty after the unloads (confirm-poll)
        if ps_probes == 1:
            return {"models": [{"name": "llama3.1:8b"}]}
        return {"models": []}

    with patch("void_workers.gpu._http", side_effect=fake_http):
        freed = gpu.free_ollama_vram(base="http://x:11434")

    assert freed == ["llama3.1:8b"]
    # an unload POST with keep_alive:0 was issued for the loaded model
    expected_unload = (
        "POST",
        "http://x:11434/api/generate",
        {"model": "llama3.1:8b", "keep_alive": 0},
    )
    assert expected_unload in calls


def test_free_is_noop_when_disabled(monkeypatch):
    """With the config flag off, nothing is freed and no HTTP call is made."""
    monkeypatch.setattr(config, "OLLAMA_FREE_BEFORE_STT", False)
    with patch("void_workers.gpu._http") as h:
        assert gpu.free_ollama_vram(base="http://x:11434") == []
    h.assert_not_called()


def test_free_is_noop_when_nothing_loaded(monkeypatch):
    """When the probe reports no models, only GET requests are issued."""
    monkeypatch.setattr(config, "OLLAMA_FREE_BEFORE_STT", True)
    with patch("void_workers.gpu._http", return_value={"models": []}) as h:
        assert gpu.free_ollama_vram(base="http://x:11434") == []
    # all() is vacuously true on an empty call list, so first make sure the
    # probe was actually issued; otherwise this test could pass without
    # exercising the enabled path at all.
    assert h.call_args_list, "expected at least the /api/ps probe"
    # only the /api/ps probe, no unload POST
    assert all(c.args[0] == "GET" for c in h.call_args_list)


def test_free_never_raises_when_ollama_unreachable(monkeypatch):
    """An OSError from the HTTP layer yields [] instead of propagating."""
    monkeypatch.setattr(config, "OLLAMA_FREE_BEFORE_STT", True)
    unreachable = OSError("connection refused")
    with patch("void_workers.gpu._http", side_effect=unreachable):
        # ps fails -> [] -> no unload -> returns [] without propagating
        assert gpu.free_ollama_vram(base="http://x:11434") == []