Whisper (CT 311) and Ollama (CT 102) share one A2000. Before loading Whisper on CUDA, ask Ollama to unload its models (GET /api/ps then POST /api/generate keep_alive:0) and wait for the card to clear, so the GPU load has headroom. Best-effort and stdlib-only; Ollama reloads cooperatively, and the existing CUDA->CPU fallback covers any failure. Toggle via OLLAMA_FREE_BEFORE_STT; endpoint via OLLAMA_URL. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
33 lines
1.3 KiB
Python
33 lines
1.3 KiB
Python
import os
|
|
|
|
def env(name, default=None, required=False):
    """Look up the environment variable *name*.

    Args:
        name: Name of the environment variable to read.
        default: Value handed back when the variable is not set.
        required: When true, a lookup that would yield ``None`` (variable
            unset and no non-``None`` default) is treated as an error.

    Returns:
        The variable's value when it is set, otherwise *default*.

    Raises:
        RuntimeError: If *required* is true and the result would be ``None``.
    """
    try:
        # A variable that is present always wins over the default.
        return os.environ[name]
    except KeyError:
        pass

    if default is None and required:
        raise RuntimeError(f"env {name} is required")
    return default
|
|
|
|
def env_int(name, default):
    """Read the environment variable *name* and convert it to ``int``.

    Args:
        name: Name of the environment variable to read.
        default: Value used when the variable is not set. It goes through
            ``int()`` as well, so an int or a numeric string both work.

    Returns:
        The variable's value (or *default*) converted to ``int``.

    Raises:
        ValueError: If the value is not a valid integer literal. The message
            names the variable and shows the offending value, so a bad
            setting can be traced without a debugger.
    """
    raw = os.environ.get(name, default)
    try:
        return int(raw)
    except ValueError as err:
        # int()'s own message only shows the literal, not which variable
        # carried it; re-raise with the variable name attached.
        raise ValueError(
            f"env {name} must be an integer, got {raw!r}"
        ) from err
|
|
|
|
# Mandatory: there is no default, so env() raises RuntimeError when unset.
DATABASE_URL = env("DATABASE_URL", required=True)

# Filesystem locations and model selection, each with a built-in default.
BLOB_ROOT = env("BLOB_ROOT", default="/var/lib/void/blobs")
WHISPER_MODEL = env("WHISPER_MODEL", default="small.en")
WHISPER_CACHE = env("WHISPER_CACHE", default="/var/lib/void/whisper-models")

# Boolean flag: only the exact lowercase string "true" turns it on.
ALLOW_PRIVATE = env("VOID_INGEST_ALLOW_PRIVATE", default="false") == "true"
|
|
|
|
# GPU sharing: a single A2000 serves both Whisper and Ollama (CT 102).
# Before Whisper is loaded onto the GPU, Ollama is asked to unload its models
# to make room; it reloads them cooperatively on its next request. This is
# best-effort -- the CPU fallback covers any failure.
OLLAMA_URL = env("OLLAMA_URL", default="http://192.168.1.185:11434")

# Enabled by default; only the exact lowercase string "true" keeps it on.
OLLAMA_FREE_BEFORE_STT = env("OLLAMA_FREE_BEFORE_STT", default="true") == "true"
|
|
|
|
# Integer setting per job kind, each overridable through its own environment
# variable. Presumably the maximum number of concurrent workers for that
# kind -- confirm against the consumer of this mapping.
CONCURRENCY = {
    job_kind: env_int(variable, fallback)
    for job_kind, variable, fallback in (
        ("extract.pdf", "VOID_CONCURRENCY_EXTRACT_PDF", 2),
        ("extract.image", "VOID_CONCURRENCY_EXTRACT_IMAGE", 2),
        ("ingest.video", "VOID_CONCURRENCY_INGEST_VIDEO", 1),
        ("sync.source_doc", "VOID_CONCURRENCY_SYNC_SOURCE_DOC", 1),
        ("echo", "VOID_CONCURRENCY_ECHO", 1),
    )
}
|
|
|
|
# Poll interval; milliseconds judging by the name. Overridable through
# VOID_POLL_INTERVAL_MS.
POLL_INTERVAL_MS = env_int("VOID_POLL_INTERVAL_MS", default=1000)
|