# Void-Homelab/workers/void_workers/gpu.py

"""Cooperative GPU sharing with Ollama.

Whisper (this worker, CT 311) and Ollama (CT 102) both pass through Z's single
RTX A2000. Before Whisper loads on the GPU we ask Ollama to unload its models so
there's room; Ollama transparently reloads on its next request. Everything here
is best-effort and never raises — if Ollama is unreachable or slow, Whisper
still tries the GPU and falls back to CPU (see model.py).

Stdlib urllib only (the workers carry no `requests`/`httpx` dependency).
"""
import json
import time
import urllib.request

from .log import log
from . import config


def _http(method, url, body=None, timeout=5):
    data = json.dumps(body).encode() if body is not None else None
    req = urllib.request.Request(
        url, data=data, method=method,
        headers={"Content-Type": "application/json"},
    )
    with urllib.request.urlopen(req, timeout=timeout) as r:
        raw = r.read().decode()
    return json.loads(raw) if raw else {}


def loaded_ollama_models(base=None, timeout=3):
    """List the names of the models Ollama reports as loaded (GET /api/ps).

    `base` defaults to config.OLLAMA_URL. Entries without a truthy "name" are
    skipped. Any failure (request error, unexpected reply shape) is logged as
    `ollama_ps_failed` and reported as an empty list, so callers cannot tell
    "nothing loaded" apart from "could not ask".
    """
    base = base or config.OLLAMA_URL
    try:
        reply = _http("GET", f"{base}/api/ps", timeout=timeout)
        names = []
        for entry in reply.get("models", []):
            if entry.get("name"):
                names.append(entry["name"])
        return names
    except Exception as e:
        # Best-effort by design: swallow everything and report "none".
        log.info("ollama_ps_failed", err=str(e))
        return []


def free_ollama_vram(base=None, wait_s=6.0):
    """Ask Ollama to unload its loaded models, then wait (briefly) for the VRAM
    to actually free.

    `base` is the Ollama base URL (defaults to config.OLLAMA_URL); `wait_s` is
    how long to keep polling /api/ps for the models to disappear.

    Returns the list of models it tried to unload, whether or not the unload
    was confirmed. No-op (returns []) when OLLAMA_FREE_BEFORE_STT is disabled
    or nothing is loaded. Never raises.

    Logs `ollama_vram_freed` only when a poll shows no models loaded; if models
    are still reported once `wait_s` has elapsed it logs
    `ollama_vram_free_timeout` with the names still resident instead.
    """
    if not config.OLLAMA_FREE_BEFORE_STT:
        return []
    base = base or config.OLLAMA_URL
    models = loaded_ollama_models(base)
    if not models:
        return []
    # keep_alive:0 tells Ollama to drop the model from memory immediately.
    for name in models:
        try:
            _http("POST", f"{base}/api/generate",
                  {"model": name, "keep_alive": 0}, timeout=8)
        except Exception as e:
            # One failed unload must not stop us from trying the others.
            log.info("ollama_unload_failed", model=name, err=str(e))
    # Confirm the card is actually clear before we hand it to Whisper.
    # Poll first and always re-poll after sleeping, so `still_loaded` reflects
    # the latest state when the deadline passes.
    # NOTE: loaded_ollama_models returns [] when /api/ps itself fails, so an
    # unreachable Ollama is indistinguishable from "nothing loaded" here.
    deadline = time.monotonic() + wait_s
    still_loaded = loaded_ollama_models(base)
    while still_loaded and time.monotonic() < deadline:
        time.sleep(0.3)
        still_loaded = loaded_ollama_models(base)
    if still_loaded:
        # Don't claim success: Whisper may still hit a full card and fall back.
        log.info("ollama_vram_free_timeout", models=models,
                 still_loaded=still_loaded, wait_s=wait_s)
    else:
        log.info("ollama_vram_freed", models=models)
    return models