feat(sv): Storage · capacity card — ZFS pools, dropped pools, per-CT disk
Read-only Proxmox storage health (same PROXMOX_RO_TOKEN as the cluster card): ZFS pool health+usage, dropped zfspool storages (the donatello/leonardo SATA signal), and per-LXC rootfs fill, with a HEALTHY/WATCH/ATTENTION roll-up. Closes the monitoring gap from the 2026-06-09 audit (C1 + H2 were invisible). Pure normalizeStorage() unit-tested (4 tests). Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
@@ -34,6 +34,7 @@ import { router as littleblueRouter } from './routes/littleblue.js';
|
||||
import { router as aiUsageRouter } from './routes/ai_usage.js';
|
||||
import { router as infraRouter } from './routes/infra.js';
|
||||
import { router as clusterRouter } from './routes/cluster.js';
|
||||
import { router as storageRouter } from './routes/storage.js';
|
||||
import { router as kuttRouter } from './routes/kutt.js';
|
||||
|
||||
export function mountApi(app) {
|
||||
@@ -50,6 +51,7 @@ export function mountApi(app) {
|
||||
api.use('/actions', actionsRouter);
|
||||
api.use('/infra', infraRouter);
|
||||
api.use('/cluster', clusterRouter);
|
||||
api.use('/storage', storageRouter);
|
||||
api.use('/little-blue', littleblueRouter);
|
||||
api.use('/ai-usage', aiUsageRouter);
|
||||
api.use('/projects', projectsRouter);
|
||||
|
||||
17
lib/api/routes/storage.js
Normal file
17
lib/api/routes/storage.js
Normal file
@@ -0,0 +1,17 @@
|
||||
import { Router } from 'express';
|
||||
import { asyncWrap } from '../errors.js';
|
||||
import { storageHealth } from '../../proxmox/storage.js';
|
||||
|
||||
// Read-only storage/capacity health for the Sacred Valley card. Cached briefly so
|
||||
// multiple polling clients coalesce into one set of PVE calls. Owner or any authed agent.
|
||||
export const router = Router();
|
||||
|
||||
// Most recent payload returned by storageHealth() and the time it was stored.
let cache = { at: 0, data: null };
// Refresh currently in progress, shared by every request that arrives while it
// is pending; null when no refresh is running.
let inflight = null;
// How long (ms) a cached payload is served before a new refresh is started.
const TTL = 15_000;

// GET / — respond with the storage health payload. A fresh cache entry is served
// directly; otherwise all concurrent callers await the same refresh, so a burst
// of requests on a cold/expired cache issues one storageHealth() call, not one
// per request.
router.get('/', asyncWrap(async (_req, res) => {
  if (cache.data && Date.now() - cache.at < TTL) return res.json(cache.data);
  inflight ??= storageHealth()
    .then(data => {
      cache = { at: Date.now(), data };
      return data;
    })
    // Always release the slot (success or failure) so the next request can retry.
    .finally(() => { inflight = null; });
  res.json(await inflight);
}));
|
||||
94
lib/proxmox/storage.js
Normal file
94
lib/proxmox/storage.js
Normal file
@@ -0,0 +1,94 @@
|
||||
import { Agent } from 'undici';
|
||||
|
||||
// Read-only Proxmox storage + capacity health for the Sacred Valley card. Same
|
||||
// PVEAuditor token as the cluster card (PROXMOX_RO_TOKEN). Surfaces the two things
|
||||
// that have actually bitten this homelab and were previously invisible:
|
||||
// 1. a ZFS pool dropping out (the donatello/leonardo SATA-bus incident) — seen as
|
||||
// a zfspool storage whose status is no longer 'available'.
|
||||
// 2. a container rootfs filling up (mediastack hitting 95%) — per-LXC disk/maxdisk.
|
||||
|
||||
// Lazily created Agent that skips certificate verification; built at most once.
let insecure;

/**
 * Pick the fetch dispatcher for Proxmox calls.
 *
 * @returns {Agent|undefined} a shared Agent with `rejectUnauthorized: false`
 *   when PROXMOX_INSECURE_TLS is exactly '1'; otherwise undefined.
 */
function tlsDispatcher() {
  const insecureRequested = process.env.PROXMOX_INSECURE_TLS === '1';
  if (!insecureRequested) {
    return undefined;
  }
  if (insecure == null) {
    insecure = new Agent({ connect: { rejectUnauthorized: false } });
  }
  return insecure;
}
|
||||
|
||||
/**
 * GET `${apiUrl}/api2/json${path}` with a PVE API token and return the `data`
 * member of the JSON response.
 *
 * @param {string} path - API path appended after `/api2/json` (may include a query string).
 * @param {object} cfg
 * @param {string} cfg.apiUrl - base URL of the Proxmox API.
 * @param {string} cfg.token - value sent as `Authorization: PVEAPIToken=<token>`.
 * @param {Function} [cfg.fetchImpl=fetch] - fetch-compatible function (injectable for tests).
 * @param {number} [cfg.timeoutMs=10000] - abort the request after this many milliseconds.
 * @returns {Promise<*>} the response's `data`, or `[]` when it is null/undefined.
 * @throws {Error} `pve <path> -> <status>` on a non-2xx response; also rejects
 *   with whatever fetchImpl rejects with (including the timeout abort).
 */
async function pveGet(path, { apiUrl, token, fetchImpl = fetch, timeoutMs = 10_000 }) {
  const res = await fetchImpl(`${apiUrl}/api2/json${path}`, {
    headers: { Authorization: `PVEAPIToken=${token}` },
    dispatcher: tlsDispatcher(),
    // Bound the call: without a signal an unresponsive node would keep the
    // caller (and every request waiting on it) pending indefinitely.
    signal: AbortSignal.timeout(timeoutMs)
  });
  if (!res.ok) throw new Error(`pve ${path} -> ${res.status}`);
  return (await res.json())?.data ?? [];
}
|
||||
|
||||
// Fill-percentage thresholds: at or above WARN is 'warn', at or above CRIT is 'crit'.
export const WARN = 80;
export const CRIT = 90;

// Whole-number percentage of `used` over `total`; null when `total` is not > 0.
const pct = (used, total) => {
  if (!(total > 0)) return null;
  return Math.round((used / total) * 100);
};

// Map a percentage to a severity. null/undefined (unknown usage) counts as 'ok'.
const sev = (p) => {
  if (p == null) return 'ok';
  if (p >= CRIT) return 'crit';
  if (p >= WARN) return 'warn';
  return 'ok';
};

// Highest severity among the items' `status` fields: 'crit' > 'warn' > 'ok'.
// An empty list yields 'ok'.
const worstOf = (items) => {
  let worst = 'ok';
  for (const { status } of items) {
    if (status === 'crit') {
      worst = 'crit';
    } else if (status === 'warn' && worst === 'ok') {
      worst = 'warn';
    }
  }
  return worst;
};
|
||||
|
||||
/**
 * Pure: fold /nodes/{node}/disks/zfs + /cluster/resources (storage, vm) into the
 * card shape.
 *
 * @param {Array<object>} [storageRes] - storage resources; reads `plugintype`,
 *   `status`, `storage`, `node`.
 * @param {Array<object>} [vmRes] - guest resources; reads `type`, `disk`,
 *   `maxdisk`, `vmid`, `name`, `node`.
 * @param {Object<string, Array<object>>} [zfsByNode] - node name -> ZFS pool
 *   list; reads `name`, `health`, `alloc`, `size` from each pool.
 * @returns {{worst: string, pools: object[], down: object[], guests: object[], alerts: string[]}}
 *   `worst` is the highest status across pools, down storages and guests;
 *   `alerts` has one line for every item whose status is not 'ok'.
 */
export function normalizeStorage(storageRes = [], vmRes = [], zfsByNode = {}) {
  // Null-safe ordering so one record lacking a name/node cannot throw mid-sort
  // and take the whole card down.
  const byNameThenNode = (a, b) =>
    String(a.name ?? '').localeCompare(String(b.name ?? '')) ||
    String(a.node ?? '').localeCompare(String(b.node ?? ''));

  // Imported ZFS pools (health + usage). Any health other than ONLINE is crit
  // regardless of fill level.
  const pools = [];
  for (const [node, list] of Object.entries(zfsByNode)) {
    for (const z of (list || [])) {
      const p = pct(z.alloc, z.size);
      pools.push({
        name: z.name, node, health: z.health, used: z.alloc, total: z.size, pct: p,
        status: z.health !== 'ONLINE' ? 'crit' : sev(p)
      });
    }
  }
  pools.sort(byNameThenNode);

  // zfspool storages that are configured but NOT available = a pool that has dropped
  // out (or never imported). This is the donatello/leonardo signal.
  const down = storageRes
    .filter(s => s.plugintype === 'zfspool' && s.status !== 'available')
    .map(s => ({ name: s.storage, node: s.node, state: s.status || 'unavailable', status: 'crit' }))
    .sort(byNameThenNode);

  // Per-guest rootfs fill. LXC report disk/maxdisk; QEMU usually report disk=0
  // (no agent) so they're skipped rather than shown as 0%.
  const guests = vmRes
    .filter(v => v.type === 'lxc' && v.maxdisk > 0 && v.disk > 0)
    .map(v => {
      const p = pct(v.disk, v.maxdisk);
      return { vmid: v.vmid, name: v.name, node: v.node, used: v.disk, total: v.maxdisk, pct: p, status: sev(p) };
    })
    .sort((a, b) => b.pct - a.pct);

  const alerts = [
    ...down.map(d => `${d.name} (${d.node}) ${d.state}`),
    ...pools.filter(p => p.health !== 'ONLINE').map(p => `pool ${p.name} ${p.health}`),
    // An ONLINE pool that is merely filling up also raises `worst`; give it an
    // alert line too so the roll-up is never warn/crit without an explanation.
    ...pools.filter(p => p.health === 'ONLINE' && p.status !== 'ok').map(p => `pool ${p.name} ${p.pct}%`),
    ...guests.filter(g => g.status !== 'ok').map(g => `CT ${g.vmid} ${g.name} ${g.pct}%`)
  ];

  return { worst: worstOf([...pools, ...down, ...guests]), pools, down, guests, alerts };
}
|
||||
|
||||
/**
 * Fetch storage, guest and per-node ZFS data from Proxmox and return the
 * normalized card payload.
 *
 * @param {object} [opts]
 * @param {string} [opts.apiUrl] - overrides PROXMOX_API_URL.
 * @param {string} [opts.token] - overrides PROXMOX_RO_TOKEN / PROXMOX_API_TOKEN.
 * @param {Function} [opts.fetchImpl] - fetch-compatible function (for tests).
 * @returns {Promise<object>} never rejects. One of:
 *   - `{ error: 'proxmox_not_configured', at }` when URL or token is missing;
 *   - `{ error: <message>, at }` when a cluster-level call fails;
 *   - `{ ...normalizeStorage(...), zfsErrors, at }` otherwise, where
 *     `zfsErrors` is a list of `{ node, error }` for nodes whose ZFS query failed.
 */
export async function storageHealth(opts = {}) {
  const cfg = {
    apiUrl: opts.apiUrl || process.env.PROXMOX_API_URL,
    token: opts.token || process.env.PROXMOX_RO_TOKEN || process.env.PROXMOX_API_TOKEN,
    fetchImpl: opts.fetchImpl || fetch
  };
  if (!cfg.apiUrl || !cfg.token) return { error: 'proxmox_not_configured', at: Date.now() };
  try {
    const [storageRes, vmRes, nodes] = await Promise.all([
      pveGet('/cluster/resources?type=storage', cfg),
      pveGet('/cluster/resources?type=vm', cfg),
      pveGet('/nodes', cfg)
    ]);

    // Per-node ZFS queries stay best-effort (one failing node must not blank the
    // card), but a failure is recorded rather than swallowed: a node whose pools
    // could not be read would otherwise be indistinguishable from a healthy one.
    const zfsByNode = {};
    const zfsErrors = [];
    await Promise.all((nodes || [])
      .filter(n => n.status === 'online')
      .map(async n => {
        try {
          zfsByNode[n.node] = await pveGet(`/nodes/${n.node}/disks/zfs`, cfg);
        } catch (e) {
          zfsByNode[n.node] = [];
          zfsErrors.push({ node: n.node, error: String(e?.message || e) });
        }
      }));
    // Requests finish in arbitrary order; sort for a stable payload.
    zfsErrors.sort((a, b) => String(a.node).localeCompare(String(b.node)));

    const card = normalizeStorage(storageRes, vmRes, zfsByNode);
    const probeFailed = zfsErrors.length > 0;
    return {
      ...card,
      // An unreadable node is not known-healthy: lift an 'ok' roll-up to 'warn'.
      worst: probeFailed && card.worst === 'ok' ? 'warn' : card.worst,
      alerts: [
        ...(card.alerts || []),
        ...zfsErrors.map(z => `zfs query failed on ${z.node}: ${z.error}`)
      ],
      zfsErrors,
      at: Date.now()
    };
  } catch (e) {
    return { error: String(e.message || e), at: Date.now() };
  }
}
|
||||
Reference in New Issue
Block a user