feat(infra): commit live infra-audit/cluster work to reconcile git with prod

This work (network_hosts inventory + infra_audit MCP tool, /api/cluster +
Sacred Valley cluster card, topbar cluster-health pill + SW self-heal) was
built in an earlier session and DEPLOYED to CT 311 as alpha.24–26, but was
never committed to git — prod was running code absent from the repo. Commits
it as-is (already prod-validated) so git matches the live state, and restores
its alpha.24/25/26 CHANGELOG entries. Files are disjoint from the fold-in
work; both now ship together under alpha.27.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
root
2026-06-08 15:20:38 +10:00
parent ae2ea09f0c
commit b0b23ba05d
19 changed files with 606 additions and 4 deletions

View File

@@ -1,9 +1,12 @@
import { createRegistry } from '../../registry.js';
import { searchTool } from '../search.js';
import { listActionsTool, proposeActionTool } from './actions.js';
import { infraAuditTool } from './infra_audit.js';
// read (search) + her action tools. No propose_change (she fixes infra, not content).
// read (search) + her action tools + infra sanity check. No propose_change
// (she fixes infra, not content).
// Little Blue's registry. Tools are registered one by one, in the order listed:
// search, her two action tools, then the infra sanity check.
export const blueRegistry = createRegistry();
for (const tool of [searchTool, listActionsTool, proposeActionTool, infraAuditTool]) {
  blueRegistry.registerTool(tool);
}

View File

@@ -0,0 +1,17 @@
// Little Blue's infra sanity check. Runs in the MCP child (no infra creds) — it
// calls the main server's read-only /api/infra/audit, which probes wiki-referenced
// endpoints + registered service URLs and reports anything unreachable (e.g. a
// doc/registry pointing at a stale IP) plus inventory hosts missing a MAC.
/**
 * Read the main server's base URL and agent token from the environment.
 * @param {object} [env=process.env] - environment map to read from.
 * @returns {{ base: (string|undefined), token: (string|undefined) }}
 */
function api(env = process.env) {
  return { base: env.VOID_API_URL, token: env.VOID_AGENT_TOKEN };
}

/**
 * MCP tool definition for the read-only infra audit.
 *
 * The handler never throws for an expected failure: missing configuration,
 * a transport error and a non-2xx response all come back as `{ error }`, so
 * the caller always gets a structured result it can report.
 */
export const infraAuditTool = {
  name: 'infra_audit',
  description: 'Run a homelab sanity check: probe every IP:port the wiki references and every monitored service, and report unreachable endpoints (stale/incorrect IPs or ports) plus inventory hosts missing a MAC. Read-only — use to verify the docs/registry match reality.',
  input_schema: { type: 'object', properties: {} },
  /**
   * @param {object} _args - unused; the tool takes no input.
   * @param {object} _ctx - unused.
   * @param {{ fetchImpl?: Function }} [deps] - injectable fetch, for tests.
   * @returns {Promise<object>} the audit report, or `{ error }` on failure.
   */
  async handler(_args, _ctx, { fetchImpl = fetch } = {}) {
    const { base, token } = api();
    // Without both values the request would go to "undefined/api/infra/audit"
    // with "Bearer undefined"; report the real cause instead.
    if (!base || !token) return { error: 'infra_audit not configured' };

    // Tolerate a trailing slash on VOID_API_URL so the path is not doubled.
    const url = `${base.replace(/\/+$/, '')}/api/infra/audit`;
    let res;
    try {
      res = await fetchImpl(url, { headers: { Authorization: `Bearer ${token}` } });
    } catch (err) {
      // Same result shape as the HTTP-failure path below.
      return { error: `infra_audit unreachable: ${err?.message ?? err}` };
    }
    if (!res.ok) return { error: `infra_audit ${res.status}` };
    return res.json();
  }
};

View File

@@ -32,6 +32,8 @@ import { router as securityRouter } from './routes/security.js';
import { router as actionsRouter } from './routes/actions.js';
import { router as littleblueRouter } from './routes/littleblue.js';
import { router as aiUsageRouter } from './routes/ai_usage.js';
import { router as infraRouter } from './routes/infra.js';
import { router as clusterRouter } from './routes/cluster.js';
export function mountApi(app) {
const api = Router();
@@ -45,6 +47,8 @@ export function mountApi(app) {
api.use('/spaces/:space_id/companion', companionRouter);
api.use('/security', securityRouter);
api.use('/actions', actionsRouter);
api.use('/infra', infraRouter);
api.use('/cluster', clusterRouter);
api.use('/little-blue', littleblueRouter);
api.use('/ai-usage', aiUsageRouter);
api.use('/projects', projectsRouter);

17
lib/api/routes/cluster.js Normal file
View File

@@ -0,0 +1,17 @@
import { Router } from 'express';
import { asyncWrap } from '../errors.js';
import { clusterHealth } from '../../proxmox/cluster.js';
// Read-only cluster health for the Sacred Valley card. Owner or any authed agent.
// Results are cached for TTL ms, and requests that arrive while a refresh is
// already running share that refresh, so a burst of polling clients triggers a
// single clusterHealth() call even when the cache is cold or has just expired.
export const router = Router();
let cache = { at: 0, data: null };
let inflight = null; // promise of the refresh currently in progress, if any
const TTL = 10_000;

// Start a refresh unless one is already running; every caller awaits the same promise.
function refresh() {
  inflight ??= clusterHealth()
    .then((data) => {
      cache = { at: Date.now(), data };
      return data;
    })
    // Clear the slot on success and on failure so the next request can retry.
    .finally(() => { inflight = null; });
  return inflight;
}

router.get('/', asyncWrap(async (_req, res) => {
  if (cache.data && Date.now() - cache.at < TTL) return res.json(cache.data);
  res.json(await refresh());
}));

26
lib/api/routes/infra.js Normal file
View File

@@ -0,0 +1,26 @@
import { Router } from 'express';
import { asyncWrap } from '../errors.js';
import { pool } from '../../db/pool.js';
import * as monitored from '../../db/repos/monitored_services.js';
import * as networkHosts from '../../db/repos/network_hosts.js';
import { runAudit, tcpProbe } from '../../infra/audit.js';
// Read-only infra sanity check. GET /audit probes every IP:port found in the
// wiki pages plus every enabled http(s) service URL, and lists inventory hosts
// with no recorded MAC. GET /hosts returns the full host inventory. Nothing is
// mutated; the only side effect is outbound TCP connects. Available to the
// owner or any authed agent.
export const router = Router();

const PROBE_TIMEOUT_MS = 1500;
const probe = (host, port) => tcpProbe(host, port, PROBE_TIMEOUT_MS);

// Only services whose url starts with http:// or https:// are probed.
const isHttpUrl = (svc) => /^https?:\/\//.test(svc.url || '');

// GET /audit: wiki + service reachability report, with the missing-MAC host ids attached.
async function handleAudit(_req, res) {
  const wiki = await pool.query(
    `SELECT p.title, p.body_md FROM pages p JOIN spaces s ON s.id = p.space_id WHERE s.slug = 'wiki'`);
  const enabled = await monitored.listEnabled();
  const report = await runAudit({ pages: wiki.rows, services: enabled.filter(isHttpUrl), probe });
  const hostsWithoutMac = await networkHosts.missingMac();
  res.json({ ...report, inventory: { missing_mac: hostsWithoutMac.map((h) => h.id) } });
}

// GET /hosts: the whole network_hosts inventory.
async function handleHosts(_req, res) {
  const hosts = await networkHosts.all();
  res.json({ hosts });
}

router.get('/audit', asyncWrap(handleAudit));
router.get('/hosts', asyncWrap(handleHosts));

View File

@@ -0,0 +1,45 @@
-- 023_network_hosts.sql
-- Authoritative LAN inventory of cluster guests + hosts: id -> ip -> MAC.
-- Source of truth for router DHCP reservations and the infra_audit sanity check.
-- Pool is the whole .2-.254, so every pinned guest needs a static IP + a router
-- reservation on its MAC; this table is where we record the MAC<->IP mapping.
-- One row per inventoried guest or host. `id` is the stable key that the seed
-- below upserts on. `kind` and `node` are free text (no CHECK constraint).
CREATE TABLE IF NOT EXISTS network_hosts (
id text PRIMARY KEY, -- e.g. ct100, vm200, pve-z, qdevice-pi
kind text NOT NULL, -- lxc | vm | pve-host | qdevice
name text NOT NULL, -- guest/host name, e.g. mediastack, Z3, retropie
node text, -- z | Z3 | won | -
ip text, -- LAN address stored as plain text, e.g. 192.168.1.230
mac text, -- NULL when not yet captured (host down)
note text, -- free-form description
created_at timestamptz NOT NULL DEFAULT now(),
updated_at timestamptz NOT NULL DEFAULT now() -- no trigger is defined in this migration; writers must set it themselves
);
-- Non-unique lookup index on ip.
CREATE INDEX IF NOT EXISTS idx_network_hosts_ip ON network_hosts(ip);
-- Seed the current inventory (captured 2026-06-08). Idempotent: re-running keeps
-- the row (and its created_at) but refreshes kind/name/node/ip/mac/note and bumps
-- updated_at, so a later edit-and-migrate stays correct.
-- Column order per row: id, kind, name, node, ip, mac, note.
INSERT INTO network_hosts (id, kind, name, node, ip, mac, note) VALUES
('ct100','lxc','mediastack','z','192.168.1.230','BC:24:11:D8:2B:7F','Docker media host'),
('ct102','lxc','ollama','z','192.168.1.185','BC:24:11:06:89:40','Ollama (GPU)'),
('ct103','lxc','openwebui','z','192.168.1.231','BC:24:11:98:28:A1','Open WebUI'),
('ct104','lxc','bookstack','z','192.168.1.213','BC:24:11:C3:F4:0A','BookStack mirror'),
('ct105','lxc','gitea','z','192.168.1.223','BC:24:11:AA:2B:4E','Gitea (static, was DHCP)'),
('ct106','lxc','pihole','z','192.168.1.140','BC:24:11:DB:2A:39','Pi-hole DNS adblock'),
('ct107','lxc','iventoy','z','192.168.1.150','BC:24:11:9B:01:10','PXE (parked, donatello-vm rootfs)'),
('ct108','lxc','tlcapture','z','192.168.1.108','BC:24:11:6D:97:27','Farm Timelapse'),
('ct109','lxc','gramps','z','192.168.1.99','BC:24:11:8E:D3:58','Gramps Web'),
('ct110','lxc','n8n','z','192.168.1.235','BC:24:11:28:70:30','n8n'),
('ct111','lxc','magicmirror','z','192.168.1.224','BC:24:11:6C:D4:E6','MagicMirror (static, was DHCP .27)'),
('ct112','lxc','obd2','z','192.168.1.225','BC:24:11:E7:D8:BF','OBD2 telemetry (static, was DHCP .28)'),
('ct300','lxc','claude','z','192.168.1.212','BC:24:11:9E:AA:73','Claude Code workspace'),
('ct301','lxc','void1','z','192.168.1.11','BC:24:11:4D:B7:CC','Void 1.x legacy'),
('ct310','lxc','void2-db','z','192.168.1.215','BC:24:11:49:C6:29','Void 2.0 Postgres'),
('ct311','lxc','void2-app','z','192.168.1.216','BC:24:11:9B:B7:3A','Void 2.0 app'),
('vm117','vm','Pterodactyl-Deb','z','192.168.1.247','BC:24:11:37:C1:F7','Game panel (static, in-guest)'),
('vm200','vm','OpenClaw','z','192.168.1.183','BC:24:11:29:84:B9','OpenClaw agent (static, in-guest)'),
('pve-z','pve-host','z','z','192.168.1.124','00:E0:4C:0F:36:00','Cluster node 1 (GPU)'),
('pve-z3','pve-host','Z3','Z3','192.168.1.125','6C:0B:5E:78:1C:93','Cluster node 2 (HA target)'),
('qdevice-pi','qdevice','retropie','-','192.168.1.254','D8:3A:DD:22:C4:21','QDevice corosync-qnetd — reserve this MAC to .254')
-- Upsert on the primary key: an existing row has every seeded column overwritten
-- with the values above, so a mac recorded after seeding is replaced on re-run.
ON CONFLICT (id) DO UPDATE SET
kind = EXCLUDED.kind, name = EXCLUDED.name, node = EXCLUDED.node,
ip = EXCLUDED.ip, mac = EXCLUDED.mac, note = EXCLUDED.note, updated_at = now();

View File

@@ -0,0 +1,28 @@
import { pool } from '../pool.js';
// Columns returned by every query in this module (created_at is not selected).
const COLS = 'id, kind, name, node, ip, mac, note, updated_at';

// Guest/host LAN inventory (id -> ip -> mac). The canonical seed lives in
// migration 023. Consumed by the infra_audit sanity check and used as the
// source for router DHCP reservations.

/** @returns {Promise<object[]>} every inventory row, ordered by id. */
export async function all() {
  const result = await pool.query(`SELECT ${COLS} FROM network_hosts ORDER BY id`);
  return result.rows;
}
/**
 * Look up one inventory row by its id.
 * @param {string} id - inventory key, e.g. 'ct100'.
 * @returns {Promise<object|null>} the row, or null when no row has that id.
 */
export async function get(id) {
  const result = await pool.query(`SELECT ${COLS} FROM network_hosts WHERE id=$1`, [id]);
  const [row] = result.rows;
  return row || null;
}
/**
 * Rows whose mac is still NULL (e.g. the Pi when it was down at seed time).
 * @returns {Promise<object[]>} matching rows, ordered by id.
 */
export async function missingMac() {
  const sql = `SELECT ${COLS} FROM network_hosts WHERE mac IS NULL ORDER BY id`;
  const result = await pool.query(sql);
  return result.rows;
}
/**
 * Record a host's MAC and bump its updated_at.
 * @param {string} id - inventory key of the row to update.
 * @param {string} mac - value stored as-is (no validation or normalisation here).
 * @returns {Promise<object|null>} the updated row, or null when no row has that id.
 */
export async function setMac(id, mac) {
  const sql = `UPDATE network_hosts SET mac=$2, updated_at=now() WHERE id=$1 RETURNING ${COLS}`;
  const { rows } = await pool.query(sql, [id, mac]);
  return rows[0] || null;
}

86
lib/infra/audit.js Normal file
View File

@@ -0,0 +1,86 @@
import net from 'node:net';
// Doc/infra sanity check. Pure functions with an injected `probe(host, port) ->
// Promise<bool>` so they're testable offline; the default tcpProbe is used in prod.
// 192.168.x.y with an optional :port. The lookbehind/lookahead stop the match
// from starting or ending in the middle of a longer run of digits or dots.
const LAN_RE = /(?<![\d.])(192\.168\.\d{1,3}\.\d{1,3})(?::(\d{1,5}))?(?![\d])/g;

/**
 * Collect the distinct LAN endpoints mentioned in free text.
 * @param {*} text - anything; falsy values are treated as an empty string.
 * @returns {{host: string, port: (number|null)}[]} endpoints in first-seen
 *   order; a reference with no port has port null.
 */
export function extractEndpoints(text) {
  const source = String(text || '');
  const unique = new Map();
  for (const [, host, rawPort] of source.matchAll(LAN_RE)) {
    const port = rawPort ? Number(rawPort) : null;
    const key = `${host}:${port ?? ''}`;
    if (unique.has(key)) continue;
    unique.set(key, { host, port });
  }
  return Array.from(unique.values());
}
/**
 * Reduce a URL to the host/port pair a TCP probe needs.
 * @param {string} url - absolute URL.
 * @returns {{host: string, port: number}|null} null when the URL does not
 *   parse or has no host (e.g. `mailto:`). An explicit port wins; otherwise
 *   443 for https: and 80 for anything else.
 */
export function parseUrl(url) {
  let parsed;
  try {
    parsed = new URL(url);
  } catch {
    return null;
  }
  // URL.hostname keeps the square brackets on IPv6 literals; strip them so the
  // caller gets the bare address rather than "[::1]".
  const host = parsed.hostname.replace(/^\[(.*)\]$/, '$1');
  // A host-less URL would otherwise yield host '' and be probed as if it were real.
  if (!host) return null;
  const port = parsed.port ? Number(parsed.port) : (parsed.protocol === 'https:' ? 443 : 80);
  return { host, port };
}
/**
 * Default reachability probe: a TCP connect with a short timeout.
 * @param {string} host - host name or address to connect to.
 * @param {number} port - TCP port.
 * @param {number} [timeoutMs=2500] - give up after this many milliseconds.
 * @returns {Promise<boolean>} true when the connection is established, false
 *   on timeout, on a socket error, or when connect() itself refuses the
 *   arguments (e.g. a port outside 0-65535).
 */
export function tcpProbe(host, port, timeoutMs = 2500) {
  return new Promise((resolve) => {
    const sock = new net.Socket();
    let done = false;
    // Settle exactly once and always release the socket.
    const finish = (ok) => {
      if (done) return;
      done = true;
      sock.destroy();
      resolve(ok);
    };
    sock.setTimeout(timeoutMs);
    sock.once('connect', () => finish(true));
    sock.once('timeout', () => finish(false));
    sock.once('error', () => finish(false));
    try {
      // connect() validates the port synchronously and throws on a bad one.
      // Uncaught, that would reject this promise and abort the whole audit
      // over a single mistyped port in a doc.
      sock.connect(port, host);
    } catch {
      finish(false);
    }
  });
}
/**
 * Cross-check every LAN endpoint referenced in the wiki against live
 * reachability, remembering which page titles cite each one.
 * @param {{pages: {title: string, body_md: string}[], probe: Function}} args -
 *   `probe(host, port)` resolves truthy when the endpoint answers.
 * @returns {Promise<object>} `{ ok, summary, unreachable, unprobed }`.
 *   Host-only references (no port) are never probed and land in `unprobed`.
 */
export async function auditDocs({ pages, probe }) {
  // "host:port" -> { host, port, pages:Set of citing titles }
  const byKey = new Map();
  for (const page of pages || []) {
    for (const { host, port } of extractEndpoints(page.body_md)) {
      const key = `${host}:${port ?? ''}`;
      let entry = byKey.get(key);
      if (!entry) {
        entry = { host, port, pages: new Set() };
        byKey.set(key, entry);
      }
      entry.pages.add(page.title);
    }
  }

  // Split into endpoints that can be probed and bare hosts that cannot.
  const withPort = [];
  const unprobed = [];
  for (const entry of byKey.values()) {
    if (entry.port != null) withPort.push(entry);
    else unprobed.push({ host: entry.host, port: null, pages: [...entry.pages] });
  }

  // Probe one endpoint at a time, in first-seen order.
  const unreachable = [];
  for (const { host, port, pages: citing } of withPort) {
    const reachable = await probe(host, port);
    if (!reachable) unreachable.push({ host, port, pages: [...citing] });
  }

  const deadCount = unreachable.length;
  return {
    ok: deadCount === 0,
    summary: {
      endpoints: byKey.size,
      probed: withPort.length,
      reachable: withPort.length - deadCount,
      unreachable: deadCount
    },
    unreachable,
    unprobed
  };
}
/**
 * Probe each registered service's url and collect the ones that do not answer.
 * @param {{services: {id: *, url: string}[], probe: Function}} args -
 *   `probe(host, port)` resolves truthy when the endpoint answers.
 * @returns {Promise<object>} `{ ok, summary: { probed, unreachable }, unreachable }`.
 *   Services whose url does not parse are skipped and not counted as probed.
 */
export async function auditServices({ services, probe }) {
  const unreachable = [];
  let probed = 0;
  for (const svc of services || []) {
    const target = parseUrl(svc.url);
    if (target) {
      probed += 1;
      const alive = await probe(target.host, target.port);
      if (!alive) {
        unreachable.push({ id: svc.id, url: svc.url, host: target.host, port: target.port });
      }
    }
  }
  const summary = { probed, unreachable: unreachable.length };
  return { ok: unreachable.length === 0, summary, unreachable };
}
/**
 * Full sanity sweep used by the API route / MCP tool: the docs audit first,
 * then the services audit, sharing one probe.
 * @param {{pages?: object[], services?: object[], probe?: Function}} args -
 *   probe defaults to tcpProbe.
 * @returns {Promise<object>} `{ ok, docs, services }`; ok only when both
 *   sub-reports are ok.
 */
export async function runAudit({ pages = [], services = [], probe = tcpProbe }) {
  const docs = await auditDocs({ pages, probe });
  const serviceReport = await auditServices({ services, probe });
  const ok = docs.ok && serviceReport.ok;
  return { ok, docs, services: serviceReport };
}

76
lib/proxmox/cluster.js Normal file
View File

@@ -0,0 +1,76 @@
import { Agent } from 'undici';
// Read-only Proxmox cluster health for the Sacred Valley card. Meant to use a
// dedicated PVEAuditor token (PROXMOX_RO_TOKEN), not the power-action token —
// but note that clusterHealth() falls back to PROXMOX_API_TOKEN when
// PROXMOX_RO_TOKEN is unset. PVE's REST API has no vote-count endpoint, so
// "quorum" here = the corosync `quorate` flag (from /cluster/status) plus the
// HA-manager quorum status (from /cluster/ha/status/current).
// Lazily created Agent that skips certificate verification; reused across calls.
let insecure;

/**
 * Dispatcher to hand to fetch for PVE calls.
 * @returns {Agent|undefined} undefined unless PROXMOX_INSECURE_TLS is exactly
 *   '1', in which case a shared Agent with rejectUnauthorized: false.
 */
function tlsDispatcher() {
  const allowUnverified = process.env.PROXMOX_INSECURE_TLS === '1';
  if (!allowUnverified) return undefined;
  if (insecure == null) {
    insecure = new Agent({ connect: { rejectUnauthorized: false } });
  }
  return insecure;
}
/**
 * GET a path under the PVE JSON API and unwrap its `data` field.
 * @param {string} path - API path beginning with '/', e.g. '/cluster/status'.
 * @param {{apiUrl: string, token: string, fetchImpl?: Function}} cfg
 * @returns {Promise<*>} the response's `data`, or [] when it is null/undefined.
 * @throws {Error} `pve <path> -> <status>` on a non-2xx response.
 */
async function pveGet(path, { apiUrl, token, fetchImpl = fetch }) {
  const url = `${apiUrl}/api2/json${path}`;
  const headers = { Authorization: `PVEAPIToken=${token}` };
  const res = await fetchImpl(url, { headers, dispatcher: tlsDispatcher() });
  if (!res.ok) throw new Error(`pve ${path} -> ${res.status}`);
  const payload = await res.json();
  return payload?.data ?? [];
}
// HA service states that are not counted in services_error.
const SETTLED_STATES = new Set(['started', 'stopped', 'ignored', 'disabled']);

/**
 * Pure: fold the /cluster/status and /cluster/ha/status/current payloads into
 * the shape the card consumes.
 * @param {object[]} [statusData=[]] - entries typed 'cluster' or 'node'.
 * @param {object[]} [haData=[]] - entries typed 'quorum', 'master', 'fencing' or 'service'.
 * @returns {object} `{ name, quorate, nodes_total, nodes_online, nodes, ha }`,
 *   with nodes sorted by name and ha.services sorted by sid.
 */
export function normalizeCluster(statusData = [], haData = []) {
  const firstOfType = (list, type) => list.find((entry) => entry.type === type) || {};
  const isOn = (flag) => flag === 1 || flag === true;
  const orderBy = (field) => (a, b) => a[field].localeCompare(b[field]);

  const cluster = firstOfType(statusData, 'cluster');
  const nodes = statusData
    .filter((entry) => entry.type === 'node')
    .map((node) => ({
      name: node.name,
      online: isOn(node.online),
      local: !!node.local,
      ip: node.ip || null
    }))
    .sort(orderBy('name'));

  const quorum = firstOfType(haData, 'quorum');
  const master = firstOfType(haData, 'master');
  const fencing = firstOfType(haData, 'fencing');
  const services = haData
    .filter((entry) => entry.type === 'service')
    .map((svc) => ({
      // Fall back to the id with its "service:" prefix removed when sid is absent.
      sid: svc.sid || (svc.id || '').replace(/^service:/, ''),
      state: svc.state || svc.crm_state || 'unknown',
      node: svc.node || null
    }))
    .sort(orderBy('sid'));

  let unsettled = 0;
  for (const svc of services) {
    if (!SETTLED_STATES.has(svc.state)) unsettled += 1;
  }
  let onlineCount = 0;
  for (const node of nodes) {
    if (node.online) onlineCount += 1;
  }

  const ha = {
    quorum_ok: quorum.quorate === 1 || quorum.status === 'OK',
    master: master.node || null,
    fencing: fencing['armed-state'] || (fencing.status ? 'armed' : null),
    services_total: services.length,
    services_error: unsettled,
    services
  };
  return {
    name: cluster.name || null,
    quorate: isOn(cluster.quorate),
    nodes_total: cluster.nodes ?? nodes.length,
    nodes_online: onlineCount,
    nodes,
    ha
  };
}
/**
 * Fetch and normalise cluster health. Never throws: every failure is
 * returned as `{ error, at }`.
 * @param {{apiUrl?: string, token?: string, fetchImpl?: Function}} [opts] -
 *   overrides. Otherwise apiUrl comes from PROXMOX_API_URL and token from
 *   PROXMOX_RO_TOKEN, then PROXMOX_API_TOKEN.
 * @returns {Promise<object>} the normalizeCluster() shape plus `at` (ms
 *   timestamp), or `{ error, at }`.
 */
export async function clusterHealth(opts = {}) {
  const { env } = process;
  const apiUrl = opts.apiUrl || env.PROXMOX_API_URL;
  // NOTE(review): the fallback to PROXMOX_API_TOKEN looks at odds with the
  // "read-only token only" intent in the file header — confirm it is wanted.
  const token = opts.token || env.PROXMOX_RO_TOKEN || env.PROXMOX_API_TOKEN;
  const fetchImpl = opts.fetchImpl || fetch;
  if (!(apiUrl && token)) return { error: 'proxmox_not_configured', at: Date.now() };

  const cfg = { apiUrl, token, fetchImpl };
  try {
    // Both requests are started before either is awaited.
    const statusReq = pveGet('/cluster/status', cfg);
    // HA may be absent on a bare cluster, so any HA failure degrades to "no HA data".
    const haReq = pveGet('/cluster/ha/status/current', cfg).catch(() => []);
    const [status, ha] = await Promise.all([statusReq, haReq]);
    return { ...normalizeCluster(status, ha), at: Date.now() };
  } catch (e) {
    return { error: String(e.message || e), at: Date.now() };
  }
}