Files
Void-Homelab/workers/void_workers/config.py
root a9191cee00 feat(workers): free Ollama VRAM before loading Whisper on the GPU
Whisper (CT 311) and Ollama (CT 102) share one A2000. Before loading
Whisper on CUDA, ask Ollama to unload its models (GET /api/ps then POST
/api/generate keep_alive:0) and wait for the card to clear, so the GPU
load has headroom. Best-effort and stdlib-only; Ollama reloads
cooperatively, and the existing CUDA->CPU fallback covers any failure.
Toggle via OLLAMA_FREE_BEFORE_STT; endpoint via OLLAMA_URL.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-05 21:12:05 +10:00

33 lines
1.3 KiB
Python

import os
def env(name, default=None, required=False):
    """Read environment variable *name*, falling back to *default*.

    Args:
        name: Name of the environment variable to look up.
        default: Value returned when the variable is not set.
        required: When true, a missing or blank value is treated as an error.

    Returns:
        The variable's value (a string), or *default* when it is unset.

    Raises:
        RuntimeError: If *required* is true and the resolved value is
            ``None`` or a blank string.
    """
    value = os.environ.get(name, default)
    if required:
        if value is None:
            raise RuntimeError(f"env {name} is required")
        # A variable that is exported but empty (e.g. ``NAME=`` in an env
        # file) is as unusable as a missing one; fail here instead of letting
        # the empty string surface as a confusing error further downstream.
        if isinstance(value, str) and not value.strip():
            raise RuntimeError(f"env {name} is required but is empty")
    return value
def env_int(name, default):
    """Read environment variable *name* and convert it to an integer.

    Args:
        name: Name of the environment variable to look up.
        default: Value converted and returned when the variable is not set.

    Returns:
        The integer value of the variable, or ``int(default)`` when unset.

    Raises:
        ValueError: If the value cannot be parsed as an integer. The message
            names the variable so a bad setting can be found quickly.
    """
    raw = os.environ.get(name, default)
    try:
        return int(raw)
    except ValueError as err:
        # The bare int() message only shows the literal; add the variable name.
        raise ValueError(f"env {name} must be an integer, got {raw!r}") from err
# Required: importing this module raises RuntimeError when it is not set.
DATABASE_URL = env("DATABASE_URL", required=True)
BLOB_ROOT = env("BLOB_ROOT", "/var/lib/void/blobs")
WHISPER_MODEL = env("WHISPER_MODEL", "small.en")
WHISPER_CACHE = env("WHISPER_CACHE", "/var/lib/void/whisper-models")
# Boolean flags are switched on by the word "true". Case and surrounding
# whitespace are ignored so that "True" or "TRUE" is not silently read as off.
ALLOW_PRIVATE = (
    env("VOID_INGEST_ALLOW_PRIVATE", "false").strip().lower() == "true"
)
# GPU sharing: Whisper and Ollama (CT 102) share one A2000. Before loading
# Whisper on the GPU, ask Ollama to unload its models to make room (it reloads
# cooperatively on its next request). Best-effort; CPU fallback covers failure.
OLLAMA_URL = env("OLLAMA_URL", "http://192.168.1.185:11434")
OLLAMA_FREE_BEFORE_STT = (
    env("OLLAMA_FREE_BEFORE_STT", "true").strip().lower() == "true"
)
# Integer settings keyed by the names below. Each row is
# (key, overriding environment variable, default used when it is unset).
CONCURRENCY = {
    key: env_int(variable, fallback)
    for key, variable, fallback in (
        ("extract.pdf", "VOID_CONCURRENCY_EXTRACT_PDF", 2),
        ("extract.image", "VOID_CONCURRENCY_EXTRACT_IMAGE", 2),
        ("ingest.video", "VOID_CONCURRENCY_INGEST_VIDEO", 1),
        ("sync.source_doc", "VOID_CONCURRENCY_SYNC_SOURCE_DOC", 1),
        ("echo", "VOID_CONCURRENCY_ECHO", 1),
    )
}
# Integer read from VOID_POLL_INTERVAL_MS; defaults to 1000 when unset.
POLL_INTERVAL_MS = env_int("VOID_POLL_INTERVAL_MS", default=1000)