from unittest.mock import patch, call
from void_workers import gpu, config


def test_free_unloads_each_loaded_model(monkeypatch):
    """A loaded model is unloaded via a keep_alive:0 POST and reported back.

    ``gpu._http`` is replaced by a fake that records every request. The
    first ``/api/ps`` probe reports one loaded model; every later probe
    reports none, which stands in for the state after the unload.
    """
    monkeypatch.setattr(config, "OLLAMA_FREE_BEFORE_STT", True)
    recorded = []

    def fake_http(method, url, body=None, timeout=5):
        recorded.append((method, url, body))
        if not url.endswith("/api/ps"):
            # Anything other than the probe (e.g. the unload POST) gets an
            # empty response.
            return {}
        # The current request is already recorded, so the very first probe
        # sees a count of exactly 1.
        probes = sum(1 for _, seen, _ in recorded if seen.endswith("/api/ps"))
        if probes == 1:
            return {"models": [{"name": "llama3.1:8b"}]}
        return {"models": []}

    with patch("void_workers.gpu._http", side_effect=fake_http):
        freed = gpu.free_ollama_vram(base="http://x:11434")

    assert freed == ["llama3.1:8b"]

    # The loaded model must have been unloaded with a keep_alive:0 POST.
    expected_unload = (
        "POST",
        "http://x:11434/api/generate",
        {"model": "llama3.1:8b", "keep_alive": 0},
    )
    assert expected_unload in recorded


def test_free_is_noop_when_disabled(monkeypatch):
    """With OLLAMA_FREE_BEFORE_STT off, nothing is freed and no HTTP is made."""
    monkeypatch.setattr(config, "OLLAMA_FREE_BEFORE_STT", False)
    with patch("void_workers.gpu._http") as http_mock:
        freed = gpu.free_ollama_vram(base="http://x:11434")
        assert freed == []
        # The feature flag must short-circuit before any request is issued.
        http_mock.assert_not_called()


def test_free_is_noop_when_nothing_loaded(monkeypatch):
    """When the probe reports no models, nothing is freed and no POST is sent.

    ``gpu._http`` always answers ``{"models": []}``, so the only traffic
    expected is the ``/api/ps`` probe itself.
    """
    monkeypatch.setattr(config, "OLLAMA_FREE_BEFORE_STT", True)
    with patch("void_workers.gpu._http", return_value={"models": []}) as h:
        assert gpu.free_ollama_vram(base="http://x:11434") == []
        # Guard against a vacuous pass: all() over an empty call list is
        # True, so first require that the probe was actually issued.
        h.assert_called()
        # only the /api/ps probe, no unload POST
        assert all(c.args[0] == "GET" for c in h.call_args_list)


def test_free_never_raises_when_ollama_unreachable(monkeypatch):
    """An OSError from every HTTP call is swallowed and ``[]`` is returned."""
    monkeypatch.setattr(config, "OLLAMA_FREE_BEFORE_STT", True)
    unreachable = OSError("connection refused")
    with patch("void_workers.gpu._http", side_effect=unreachable):
        freed = gpu.free_ollama_vram(base="http://x:11434")
    # Expected flow: the ps probe fails -> treated as no models -> no unload
    # is attempted -> [] comes back without the error propagating.
    assert freed == []