Whisper (CT 311) and Ollama (CT 102) share one A2000. Before loading Whisper on CUDA, ask Ollama to unload its models (GET /api/ps then POST /api/generate keep_alive:0) and wait for the card to clear, so the GPU load has headroom. Best-effort and stdlib-only; Ollama reloads cooperatively, and the existing CUDA->CPU fallback covers any failure. Toggle via OLLAMA_FREE_BEFORE_STT; endpoint via OLLAMA_URL. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
66 lines
2.4 KiB
Python
66 lines
2.4 KiB
Python
"""Cooperative GPU sharing with Ollama.
|
|
|
|
Whisper (this worker, CT 311) and Ollama (CT 102) both pass through Z's single
|
|
RTX A2000. Before Whisper loads on the GPU we ask Ollama to unload its models so
|
|
there's room; Ollama transparently reloads on its next request. Everything here
|
|
is best-effort and never raises — if Ollama is unreachable or slow, Whisper
|
|
still tries the GPU and falls back to CPU (see model.py).
|
|
|
|
Stdlib urllib only (the workers carry no `requests`/`httpx` dependency).
|
|
"""
|
|
import json
|
|
import time
|
|
import urllib.request
|
|
|
|
from .log import log
|
|
from . import config
|
|
|
|
|
|
def _http(method, url, body=None, timeout=5):
    """Send a single JSON request via urllib and return the decoded reply.

    ``body``, when not None, is serialised with ``json.dumps`` and sent as the
    request payload. The ``Content-Type: application/json`` header is attached
    to every request, with or without a payload. An empty response body
    yields ``{}``; anything else is parsed with ``json.loads``.

    Errors from ``urlopen`` or from JSON decoding are not caught here; they
    propagate to the caller.
    """
    payload = None
    if body is not None:
        payload = json.dumps(body).encode()

    request = urllib.request.Request(
        url,
        data=payload,
        headers={"Content-Type": "application/json"},
        method=method,
    )
    with urllib.request.urlopen(request, timeout=timeout) as response:
        text = response.read().decode()

    if not text:
        return {}
    return json.loads(text)
|
|
|
|
|
|
def loaded_ollama_models(base=None, timeout=3):
    """List the models Ollama reports as held in memory (GET /api/ps).

    ``base`` falls back to ``config.OLLAMA_URL`` when falsy. Entries without
    a truthy ``name`` are skipped. Any exception raised while requesting or
    parsing is logged as ``ollama_ps_failed`` and an empty list is returned,
    so a caller cannot tell "nothing loaded" apart from "could not ask".
    """
    root = base or config.OLLAMA_URL
    try:
        listing = _http("GET", f"{root}/api/ps", timeout=timeout).get("models", [])
        return [entry["name"] for entry in listing if entry.get("name")]
    except Exception as exc:
        # Best-effort by design: report the failure and act as if idle.
        log.info("ollama_ps_failed", err=str(exc))
        return []
|
|
|
|
|
|
def free_ollama_vram(base=None, wait_s=6.0):
    """Ask Ollama to unload its loaded models, then briefly wait for them to go.

    Args:
        base: Ollama base URL; falls back to ``config.OLLAMA_URL`` when falsy.
        wait_s: Length, in seconds, of the polling window that follows the
            unload requests. A poll or sleep already in progress may overrun
            it slightly.

    Returns:
        The names of the models an unload was attempted for, whether or not
        each request succeeded. Empty when ``OLLAMA_FREE_BEFORE_STT`` is
        disabled or nothing was reported as loaded.

    Request failures are logged rather than raised. The outcome of the wait
    is logged as ``ollama_vram_freed`` only when a poll came back empty;
    otherwise ``ollama_vram_wait_timeout`` is logged so the log never claims
    the card is clear when that was not observed.
    """
    if not config.OLLAMA_FREE_BEFORE_STT:
        return []
    base = base or config.OLLAMA_URL
    models = loaded_ollama_models(base)
    if not models:
        return []

    # keep_alive:0 tells Ollama to drop the model from memory immediately.
    for name in models:
        try:
            _http("POST", f"{base}/api/generate",
                  {"model": name, "keep_alive": 0}, timeout=8)
        except Exception as e:
            # One stubborn model must not stop us unloading the others.
            log.info("ollama_unload_failed", model=name, err=str(e))

    # Poll until Ollama reports nothing loaded or the window closes.
    # loaded_ollama_models() also returns [] when the request itself fails,
    # so an unreachable Ollama ends the wait as well.
    cleared = False
    deadline = time.monotonic() + wait_s
    while time.monotonic() < deadline:
        if not loaded_ollama_models(base):
            cleared = True
            break
        time.sleep(0.3)

    if cleared:
        log.info("ollama_vram_freed", models=models)
    else:
        # The window expired (or was zero) without an empty poll; the caller
        # proceeds anyway, but the log should not report success.
        log.info("ollama_vram_wait_timeout", models=models, wait_s=wait_s)
    return models
|