feat(infra): commit live infra-audit/cluster work to reconcile git with prod
This work (network_hosts inventory + infra_audit MCP tool, /api/cluster + Sacred Valley cluster card, topbar cluster-health pill + SW self-heal) was built in an earlier session and DEPLOYED to CT 311 as alpha.24–26, but was never committed to git — prod was running code absent from the repo. Commits it as-is (already prod-validated) so git matches the live state, and restores its alpha.24/25/26 CHANGELOG entries. Files are disjoint from the fold-in work; both now ship together under alpha.27. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
@@ -1,9 +1,12 @@
|
||||
import { createRegistry } from '../../registry.js';
|
||||
import { searchTool } from '../search.js';
|
||||
import { listActionsTool, proposeActionTool } from './actions.js';
|
||||
import { infraAuditTool } from './infra_audit.js';
|
||||
|
||||
// read (search) + her action tools. No propose_change (she fixes infra, not content).
|
||||
// read (search) + her action tools + infra sanity check. No propose_change
|
||||
// (she fixes infra, not content).
|
||||
// Little Blue's tool registry. Tools are registered in the order listed:
// search, her two action tools, then the infra sanity check.
export const blueRegistry = createRegistry();
for (const tool of [searchTool, listActionsTool, proposeActionTool, infraAuditTool]) {
  blueRegistry.registerTool(tool);
}
|
||||
|
||||
17
lib/ai/agent/tools/blue/infra_audit.js
Normal file
17
lib/ai/agent/tools/blue/infra_audit.js
Normal file
@@ -0,0 +1,17 @@
|
||||
// Little Blue's infra sanity check. Runs in the MCP child (no infra creds) — it
|
||||
// calls the main server's read-only /api/infra/audit, which probes wiki-referenced
|
||||
// endpoints + registered service URLs and reports anything unreachable (e.g. a
|
||||
// doc/registry pointing at a stale IP) plus inventory hosts missing a MAC.
|
||||
/**
 * Read the main server's base URL and the agent token from the environment.
 *
 * @param {object} [env=process.env] - Environment map to read from.
 * @returns {{ base: (string|undefined), token: (string|undefined) }}
 */
function api(env = process.env) {
  return { base: env.VOID_API_URL, token: env.VOID_AGENT_TOKEN };
}

/**
 * Tool definition for `infra_audit`: GETs `<VOID_API_URL>/api/infra/audit`
 * with a bearer token and returns the parsed JSON body.
 *
 * The handler never throws. Every failure mode (missing configuration,
 * non-2xx response, network error, unparsable body) comes back as
 * `{ error: string }`, matching the shape already used for non-2xx responses.
 */
export const infraAuditTool = {
  name: 'infra_audit',
  description: 'Run a homelab sanity check: probe every IP:port the wiki references and every monitored service, and report unreachable endpoints (stale/incorrect IPs or ports) plus inventory hosts missing a MAC. Read-only — use to verify the docs/registry match reality.',
  input_schema: { type: 'object', properties: {} },

  /**
   * @param {object} _args - Unused; the tool takes no input.
   * @param {object} _ctx - Unused.
   * @param {{ fetchImpl?: Function }} [deps] - Injection point for a fetch-compatible function.
   * @returns {Promise<object>} The audit report, or `{ error }` on any failure.
   */
  async handler(_args, _ctx, { fetchImpl = fetch } = {}) {
    const { base, token } = api();
    // Without both values the request can only fail; say so explicitly rather
    // than fetching "undefined/api/..." or sending "Bearer undefined".
    if (!base || !token) return { error: 'infra_audit not_configured' };

    // Tolerate a trailing slash on the configured base so the path never
    // becomes "//api/infra/audit".
    const url = `${base.replace(/\/+$/, '')}/api/infra/audit`;
    try {
      const res = await fetchImpl(url, { headers: { Authorization: `Bearer ${token}` } });
      if (!res.ok) return { error: `infra_audit ${res.status}` };
      return await res.json();
    } catch (err) {
      // Connection refused, DNS failure, invalid JSON, etc.
      return { error: `infra_audit failed: ${err?.message ?? err}` };
    }
  }
};
|
||||
@@ -32,6 +32,8 @@ import { router as securityRouter } from './routes/security.js';
|
||||
import { router as actionsRouter } from './routes/actions.js';
|
||||
import { router as littleblueRouter } from './routes/littleblue.js';
|
||||
import { router as aiUsageRouter } from './routes/ai_usage.js';
|
||||
import { router as infraRouter } from './routes/infra.js';
|
||||
import { router as clusterRouter } from './routes/cluster.js';
|
||||
|
||||
export function mountApi(app) {
|
||||
const api = Router();
|
||||
@@ -45,6 +47,8 @@ export function mountApi(app) {
|
||||
api.use('/spaces/:space_id/companion', companionRouter);
|
||||
api.use('/security', securityRouter);
|
||||
api.use('/actions', actionsRouter);
|
||||
api.use('/infra', infraRouter);
|
||||
api.use('/cluster', clusterRouter);
|
||||
api.use('/little-blue', littleblueRouter);
|
||||
api.use('/ai-usage', aiUsageRouter);
|
||||
api.use('/projects', projectsRouter);
|
||||
|
||||
17
lib/api/routes/cluster.js
Normal file
17
lib/api/routes/cluster.js
Normal file
@@ -0,0 +1,17 @@
|
||||
import { Router } from 'express';
|
||||
import { asyncWrap } from '../errors.js';
|
||||
import { clusterHealth } from '../../proxmox/cluster.js';
|
||||
|
||||
// Read-only cluster health for the Sacred Valley card. Results are cached for
// TTL ms, and requests that arrive while a refresh is already running share
// that refresh, so polling clients coalesce into one PVE call even on a cold
// or just-expired cache.
export const router = Router();

const TTL = 10_000;
let cache = { at: 0, data: null };
// Promise for the refresh currently in progress, or null when idle.
let inflight = null;

router.get('/', asyncWrap(async (_req, res) => {
  if (cache.data && Date.now() - cache.at < TTL) return res.json(cache.data);

  // Only the first request after expiry starts a refresh; later ones await it.
  inflight ??= clusterHealth()
    .then((data) => {
      cache = { at: Date.now(), data };
      return data;
    })
    .finally(() => { inflight = null; });

  res.json(await inflight);
}));
|
||||
26
lib/api/routes/infra.js
Normal file
26
lib/api/routes/infra.js
Normal file
@@ -0,0 +1,26 @@
|
||||
import { Router } from 'express';
|
||||
import { asyncWrap } from '../errors.js';
|
||||
import { pool } from '../../db/pool.js';
|
||||
import * as monitored from '../../db/repos/monitored_services.js';
|
||||
import * as networkHosts from '../../db/repos/network_hosts.js';
|
||||
import { runAudit, tcpProbe } from '../../infra/audit.js';
|
||||
|
||||
// Read-only infra sanity check: probe every IP:port referenced in the wiki and
// every enabled service URL, and surface hosts missing a recorded MAC. The
// handlers here perform no mutations, only SELECTs and TCP connects.
export const router = Router();

// Per-endpoint probe used by the audit: a TCP connect with a 1500 ms timeout.
const probe = (host, port) => tcpProbe(host, port, 1500);

// GET /audit -> { ...runAudit report, inventory: { missing_mac: [host ids] } }
router.get('/audit', asyncWrap(async (_req, res) => {
  // The three reads are independent, so load them together. This also makes a
  // database problem fail the request before the (slow) probe sweep starts.
  const [pageResult, enabledServices, hostsWithoutMac] = await Promise.all([
    pool.query(
      `SELECT p.title, p.body_md FROM pages p JOIN spaces s ON s.id = p.space_id WHERE s.slug = 'wiki'`),
    monitored.listEnabled(),
    networkHosts.missingMac()
  ]);

  // Only http(s) service URLs can be turned into a host:port to probe.
  const services = enabledServices.filter(s => /^https?:\/\//.test(s.url || ''));
  const report = await runAudit({ pages: pageResult.rows, services, probe });
  res.json({ ...report, inventory: { missing_mac: hostsWithoutMac.map(h => h.id) } });
}));

// GET /hosts -> { hosts: [...] } with every inventory row.
router.get('/hosts', asyncWrap(async (_req, res) => {
  res.json({ hosts: await networkHosts.all() });
}));
|
||||
45
lib/db/migrations/023_network_hosts.sql
Normal file
45
lib/db/migrations/023_network_hosts.sql
Normal file
@@ -0,0 +1,45 @@
|
||||
-- 023_network_hosts.sql
|
||||
-- Authoritative LAN inventory of cluster guests + hosts: id -> ip -> MAC.
|
||||
-- Source of truth for router DHCP reservations and the infra_audit sanity check.
|
||||
-- Pool is the whole .2-.254, so every pinned guest needs a static IP + a router
|
||||
-- reservation on its MAC; this table is where we record the MAC<->IP mapping.
|
||||
-- One row per cluster guest or host, keyed by a short human-readable id.
CREATE TABLE IF NOT EXISTS network_hosts (
  id         TEXT        PRIMARY KEY,              -- e.g. ct100, vm200, pve-z, qdevice-pi
  kind       TEXT        NOT NULL,                 -- lxc | vm | pve-host | qdevice
  name       TEXT        NOT NULL,
  node       TEXT,                                 -- z | Z3 | won | -
  ip         TEXT,
  mac        TEXT,                                 -- NULL when not yet captured (host down)
  note       TEXT,
  created_at TIMESTAMPTZ NOT NULL DEFAULT now(),
  updated_at TIMESTAMPTZ NOT NULL DEFAULT now()
);

-- Secondary index on the ip column.
CREATE INDEX IF NOT EXISTS idx_network_hosts_ip
  ON network_hosts (ip);
|
||||
|
||||
-- Seed the current inventory (captured 2026-06-08). Idempotent: re-running keeps
|
||||
-- the row but refreshes ip/mac/note so a later edit-and-migrate stays correct.
|
||||
INSERT INTO network_hosts (id, kind, name, node, ip, mac, note) VALUES
|
||||
('ct100','lxc','mediastack','z','192.168.1.230','BC:24:11:D8:2B:7F','Docker media host'),
|
||||
('ct102','lxc','ollama','z','192.168.1.185','BC:24:11:06:89:40','Ollama (GPU)'),
|
||||
('ct103','lxc','openwebui','z','192.168.1.231','BC:24:11:98:28:A1','Open WebUI'),
|
||||
('ct104','lxc','bookstack','z','192.168.1.213','BC:24:11:C3:F4:0A','BookStack mirror'),
|
||||
('ct105','lxc','gitea','z','192.168.1.223','BC:24:11:AA:2B:4E','Gitea (static, was DHCP)'),
|
||||
('ct106','lxc','pihole','z','192.168.1.140','BC:24:11:DB:2A:39','Pi-hole DNS adblock'),
|
||||
('ct107','lxc','iventoy','z','192.168.1.150','BC:24:11:9B:01:10','PXE (parked, donatello-vm rootfs)'),
|
||||
('ct108','lxc','tlcapture','z','192.168.1.108','BC:24:11:6D:97:27','Farm Timelapse'),
|
||||
('ct109','lxc','gramps','z','192.168.1.99','BC:24:11:8E:D3:58','Gramps Web'),
|
||||
('ct110','lxc','n8n','z','192.168.1.235','BC:24:11:28:70:30','n8n'),
|
||||
('ct111','lxc','magicmirror','z','192.168.1.224','BC:24:11:6C:D4:E6','MagicMirror (static, was DHCP .27)'),
|
||||
('ct112','lxc','obd2','z','192.168.1.225','BC:24:11:E7:D8:BF','OBD2 telemetry (static, was DHCP .28)'),
|
||||
('ct300','lxc','claude','z','192.168.1.212','BC:24:11:9E:AA:73','Claude Code workspace'),
|
||||
('ct301','lxc','void1','z','192.168.1.11','BC:24:11:4D:B7:CC','Void 1.x legacy'),
|
||||
('ct310','lxc','void2-db','z','192.168.1.215','BC:24:11:49:C6:29','Void 2.0 Postgres'),
|
||||
('ct311','lxc','void2-app','z','192.168.1.216','BC:24:11:9B:B7:3A','Void 2.0 app'),
|
||||
('vm117','vm','Pterodactyl-Deb','z','192.168.1.247','BC:24:11:37:C1:F7','Game panel (static, in-guest)'),
|
||||
('vm200','vm','OpenClaw','z','192.168.1.183','BC:24:11:29:84:B9','OpenClaw agent (static, in-guest)'),
|
||||
('pve-z','pve-host','z','z','192.168.1.124','00:E0:4C:0F:36:00','Cluster node 1 (GPU)'),
|
||||
('pve-z3','pve-host','Z3','Z3','192.168.1.125','6C:0B:5E:78:1C:93','Cluster node 2 (HA target)'),
|
||||
('qdevice-pi','qdevice','retropie','-','192.168.1.254','D8:3A:DD:22:C4:21','QDevice corosync-qnetd — reserve this MAC to .254')
|
||||
ON CONFLICT (id) DO UPDATE SET
|
||||
kind = EXCLUDED.kind, name = EXCLUDED.name, node = EXCLUDED.node,
|
||||
ip = EXCLUDED.ip, mac = EXCLUDED.mac, note = EXCLUDED.note, updated_at = now();
|
||||
28
lib/db/repos/network_hosts.js
Normal file
28
lib/db/repos/network_hosts.js
Normal file
@@ -0,0 +1,28 @@
|
||||
import { pool } from '../pool.js';
|
||||
|
||||
const COLS = 'id, kind, name, node, ip, mac, note, updated_at';
|
||||
|
||||
// Authoritative guest/host LAN inventory (id -> ip -> mac). Reads plus one writer (setMac); the
|
||||
// canonical seed lives in migration 023. Used by the infra_audit sanity check
|
||||
// and as the source for router DHCP reservations.
|
||||
/**
 * List every inventory row, ordered by id.
 *
 * @returns {Promise<object[]>} Rows carrying the COLS columns.
 */
export async function all() {
  const result = await pool.query(`SELECT ${COLS} FROM network_hosts ORDER BY id`);
  return result.rows;
}
|
||||
|
||||
/**
 * Look up a single inventory row by its id.
 *
 * @param {string} id - Host id.
 * @returns {Promise<object|null>} The row, or null when no row matches.
 */
export async function get(id) {
  const result = await pool.query(`SELECT ${COLS} FROM network_hosts WHERE id=$1`, [id]);
  const [row] = result.rows;
  return row || null;
}
|
||||
|
||||
/**
 * List the hosts whose MAC column is still NULL, ordered by id.
 *
 * @returns {Promise<object[]>} Rows carrying the COLS columns.
 */
export async function missingMac() {
  const result = await pool.query(`SELECT ${COLS} FROM network_hosts WHERE mac IS NULL ORDER BY id`);
  return result.rows;
}
|
||||
|
||||
/**
 * Store a MAC address for one host and bump its updated_at.
 *
 * @param {string} id - Host id to update.
 * @param {string|null} mac - Value written to the mac column as-is.
 * @returns {Promise<object|null>} The updated row, or null when the id matched nothing.
 */
export async function setMac(id, mac) {
  const sql = `UPDATE network_hosts SET mac=$2, updated_at=now() WHERE id=$1 RETURNING ${COLS}`;
  const result = await pool.query(sql, [id, mac]);
  const [row] = result.rows;
  return row || null;
}
|
||||
86
lib/infra/audit.js
Normal file
86
lib/infra/audit.js
Normal file
@@ -0,0 +1,86 @@
|
||||
import net from 'node:net';
|
||||
|
||||
// Doc/infra sanity check. Pure functions with an injected `probe(host, port) ->
|
||||
// Promise<bool>` so they're testable offline; the default tcpProbe is used in prod.
|
||||
|
||||
// Candidate matcher for `192.168.x.y` with an optional `:port`. The lookarounds
// keep it from matching inside a longer dotted or numeric run. It is purely
// textual, so extractEndpoints() range-checks the octets and port afterwards.
const LAN_RE = /(?<![\d.])(192\.168\.\d{1,3}\.\d{1,3})(?::(\d{1,5}))?(?![\d])/g;

/**
 * Pull unique LAN endpoints out of free text.
 *
 * - Matches whose octets exceed 255 are not addresses and are dropped.
 * - A port outside 1..65535 is not connectable, so the reference is kept as a
 *   host-only endpoint (`port: null`) instead of being handed to the prober.
 *
 * @param {*} text - Text to scan; null/undefined/empty yields [].
 * @returns {{ host: string, port: (number|null) }[]} Unique endpoints in
 *   first-seen order; host-only references come back with `port: null`.
 */
export function extractEndpoints(text) {
  const seen = new Map();
  for (const [, host, rawPort] of String(text || '').matchAll(LAN_RE)) {
    if (host.split('.').some((octet) => Number(octet) > 255)) continue;

    let port = rawPort === undefined ? null : Number(rawPort);
    if (port !== null && (port < 1 || port > 65535)) port = null;

    const key = `${host}:${port ?? ''}`;
    if (!seen.has(key)) seen.set(key, { host, port });
  }
  return [...seen.values()];
}
|
||||
|
||||
/**
 * Reduce a URL to the host/port pair a TCP probe needs.
 *
 * - The port defaults to 443 for `https:` and 80 for everything else.
 * - IPv6 literals lose their URL brackets (`[::1]` -> `::1`), since the
 *   bracketed form is URL syntax rather than a connectable address.
 * - URLs without a host (e.g. `file:///x`) return null instead of an empty
 *   host that a socket connect would treat as localhost.
 *
 * @param {string} url - Absolute URL.
 * @returns {{ host: string, port: number } | null} null when `url` does not
 *   parse or carries no host.
 */
export function parseUrl(url) {
  let parsed;
  try {
    parsed = new URL(url);
  } catch {
    return null;
  }

  const { hostname, protocol } = parsed;
  if (!hostname) return null;

  const bracketed = hostname.startsWith('[') && hostname.endsWith(']');
  const host = bracketed ? hostname.slice(1, -1) : hostname;
  const port = parsed.port ? Number(parsed.port) : (protocol === 'https:' ? 443 : 80);
  return { host, port };
}
|
||||
|
||||
/**
 * Default reachability probe: a TCP connect with a short timeout.
 *
 * The returned promise never rejects. An out-of-range port makes
 * `socket.connect()` throw synchronously; that is reported as "unreachable"
 * rather than surfacing as a rejection in the caller.
 *
 * @param {string} host - Host name or IP address to connect to.
 * @param {number} port - TCP port.
 * @param {number} [timeoutMs=2500] - Socket timeout before giving up.
 * @returns {Promise<boolean>} true when the connection was established.
 */
export function tcpProbe(host, port, timeoutMs = 2500) {
  return new Promise((resolve) => {
    const sock = new net.Socket();
    let done = false;
    // Settle exactly once and always release the socket.
    const finish = (ok) => {
      if (done) return;
      done = true;
      sock.destroy();
      resolve(ok);
    };
    sock.setTimeout(timeoutMs);
    sock.once('connect', () => finish(true));
    sock.once('timeout', () => finish(false));
    sock.once('error', () => finish(false));
    try {
      sock.connect(port, host);
    } catch {
      // Argument validation failed (e.g. port > 65535): nothing to reach.
      finish(false);
    }
  });
}
|
||||
|
||||
/**
 * Cross-check every LAN endpoint referenced in the given pages against live
 * reachability, grouping each endpoint with the titles of the pages that
 * mention it.
 *
 * Endpoints with a port are probed concurrently, so total wall time is
 * bounded by the slowest probe rather than the sum of all timeouts. The
 * `unreachable` list keeps first-seen order. Host-only references cannot be
 * probed and are returned under `unprobed`.
 *
 * @param {object} args
 * @param {{ title: string, body_md: string }[]} [args.pages] - Pages to scan.
 * @param {(host: string, port: number) => Promise<boolean>} args.probe -
 *   Reachability check.
 * @returns {Promise<{ ok: boolean, summary: object, unreachable: object[], unprobed: object[] }>}
 */
export async function auditDocs({ pages, probe }) {
  const byEndpoint = new Map(); // host:port -> { host, port, pages:Set }
  for (const page of pages || []) {
    for (const { host, port } of extractEndpoints(page.body_md)) {
      const key = `${host}:${port ?? ''}`;
      let entry = byEndpoint.get(key);
      if (!entry) {
        entry = { host, port, pages: new Set() };
        byEndpoint.set(key, entry);
      }
      entry.pages.add(page.title);
    }
  }

  const all = [...byEndpoint.values()];
  const probable = all.filter(e => e.port != null);
  const unprobed = all
    .filter(e => e.port == null)
    .map(e => ({ host: e.host, port: null, pages: [...e.pages] }));

  // Probes are independent; run them together and read results by index.
  const verdicts = await Promise.all(probable.map(e => probe(e.host, e.port)));
  const unreachable = probable
    .filter((_, i) => !verdicts[i])
    .map(e => ({ host: e.host, port: e.port, pages: [...e.pages] }));

  return {
    ok: unreachable.length === 0,
    summary: {
      endpoints: all.length,
      probed: probable.length,
      reachable: probable.length - unreachable.length,
      unreachable: unreachable.length
    },
    unreachable,
    unprobed
  };
}
|
||||
|
||||
/**
 * Probe each registered service's URL and flag the ones that do not answer.
 *
 * Services whose URL does not parse are skipped and not counted as probed.
 * Probes run concurrently; `unreachable` keeps the input order.
 *
 * @param {object} args
 * @param {{ id: *, url: string }[]} [args.services] - Services to check.
 * @param {(host: string, port: number) => Promise<boolean>} args.probe -
 *   Reachability check.
 * @returns {Promise<{ ok: boolean, summary: { probed: number, unreachable: number }, unreachable: object[] }>}
 */
export async function auditServices({ services, probe }) {
  const targets = [];
  for (const service of services || []) {
    const hp = parseUrl(service.url);
    if (hp) targets.push({ id: service.id, url: service.url, host: hp.host, port: hp.port });
  }

  // Probes are independent; run them together and read results by index.
  const verdicts = await Promise.all(targets.map(t => probe(t.host, t.port)));
  const unreachable = targets.filter((_, i) => !verdicts[i]);

  return {
    ok: unreachable.length === 0,
    summary: { probed: targets.length, unreachable: unreachable.length },
    unreachable
  };
}
|
||||
|
||||
/**
 * Full sanity sweep: the docs audit followed by the services audit.
 *
 * @param {object} args
 * @param {object[]} [args.pages=[]] - Pages handed to auditDocs.
 * @param {object[]} [args.services=[]] - Services handed to auditServices.
 * @param {Function} [args.probe=tcpProbe] - Reachability check shared by both audits.
 * @returns {Promise<{ ok: boolean, docs: object, services: object }>} `ok` is
 *   true only when both sub-reports are ok.
 */
export async function runAudit({ pages = [], services = [], probe = tcpProbe }) {
  const docReport = await auditDocs({ pages, probe });
  const serviceReport = await auditServices({ services, probe });
  const ok = docReport.ok && serviceReport.ok;
  return { ok, docs: docReport, services: serviceReport };
}
|
||||
76
lib/proxmox/cluster.js
Normal file
76
lib/proxmox/cluster.js
Normal file
@@ -0,0 +1,76 @@
|
||||
import { Agent } from 'undici';
|
||||
|
||||
// Read-only Proxmox cluster health for the Sacred Valley card. Uses a dedicated
|
||||
// PVEAuditor token (PROXMOX_RO_TOKEN) when set; clusterHealth() falls back to PROXMOX_API_TOKEN otherwise. PVE's REST
|
||||
// API has no vote-count endpoint, so "quorum" here = the corosync `quorate` flag
|
||||
// (from /cluster/status) plus the HA-manager quorum status (/cluster/ha/status/current).
|
||||
|
||||
// Lazily created dispatcher that skips TLS certificate verification; built at
// most once and reused afterwards.
let insecure;

/**
 * Pick the fetch dispatcher for Proxmox calls.
 *
 * @returns {object|undefined} The shared non-verifying Agent when
 *   PROXMOX_INSECURE_TLS is exactly '1'; otherwise undefined.
 */
function tlsDispatcher() {
  const wantsInsecure = process.env.PROXMOX_INSECURE_TLS === '1';
  if (!wantsInsecure) return undefined;
  if (insecure == null) {
    insecure = new Agent({ connect: { rejectUnauthorized: false } });
  }
  return insecure;
}
|
||||
|
||||
/**
 * GET `<apiUrl>/api2/json<path>` with a PVE API token and unwrap the `data`
 * envelope.
 *
 * @param {string} path - API path beginning with '/', e.g. '/cluster/status'.
 * @param {object} cfg
 * @param {string} cfg.apiUrl - Base URL of the Proxmox API host.
 * @param {string} cfg.token - Token sent as `PVEAPIToken=<token>`.
 * @param {Function} [cfg.fetchImpl=fetch] - fetch-compatible function.
 * @returns {Promise<*>} The response's `data` field, or [] when it is null/undefined.
 * @throws {Error} When the response status is not ok.
 */
async function pveGet(path, { apiUrl, token, fetchImpl = fetch }) {
  const url = `${apiUrl}/api2/json${path}`;
  const init = {
    headers: { Authorization: `PVEAPIToken=${token}` },
    dispatcher: tlsDispatcher()
  };
  const res = await fetchImpl(url, init);
  if (!res.ok) throw new Error(`pve ${path} -> ${res.status}`);
  const payload = await res.json();
  return payload?.data ?? [];
}
|
||||
|
||||
// HA service states treated as settled; any other state is counted in
// `ha.services_error`.
const SETTLED_STATES = new Set(['started', 'stopped', 'ignored', 'disabled']);

/**
 * Pure: fold the /cluster/status and /cluster/ha/status/current payloads into
 * the shape the cluster card consumes.
 *
 * Tolerates degenerate input: a null or non-array payload is treated as
 * empty, null entries are ignored, and sorting never throws on a node without
 * a name or a service with a non-string id.
 *
 * @param {object[]} [statusData=[]] - Entries of type 'cluster' and 'node'.
 * @param {object[]} [haData=[]] - Entries of type 'quorum', 'master',
 *   'fencing' and 'service'.
 * @returns {{
 *   name: (string|null), quorate: boolean, nodes_total: number,
 *   nodes_online: number, nodes: object[],
 *   ha: { quorum_ok: boolean, master: (string|null), fencing: (string|null),
 *         services_total: number, services_error: number, services: object[] }
 * }}
 */
export function normalizeCluster(statusData = [], haData = []) {
  const statusEntries = Array.isArray(statusData) ? statusData.filter(Boolean) : [];
  const haEntries = Array.isArray(haData) ? haData.filter(Boolean) : [];
  // Comparator that orders by one field, coercing missing/non-string values.
  const byField = (field) => (a, b) =>
    String(a[field] ?? '').localeCompare(String(b[field] ?? ''));

  const cluster = statusEntries.find(e => e.type === 'cluster') || {};
  const nodes = statusEntries
    .filter(e => e.type === 'node')
    .map(n => ({ name: n.name, online: n.online === 1 || n.online === true, local: !!n.local, ip: n.ip || null }))
    .sort(byField('name'));

  const quorum = haEntries.find(e => e.type === 'quorum') || {};
  const master = haEntries.find(e => e.type === 'master') || {};
  const fencing = haEntries.find(e => e.type === 'fencing') || {};
  const services = haEntries
    .filter(e => e.type === 'service')
    .map(s => ({
      // Prefer the explicit sid; otherwise derive it from an id like "service:ct:311".
      sid: s.sid || String(s.id || '').replace(/^service:/, ''),
      state: s.state || s.crm_state || 'unknown',
      node: s.node || null
    }))
    .sort(byField('sid'));
  const servicesError = services.filter(s => !SETTLED_STATES.has(s.state));

  return {
    name: cluster.name || null,
    quorate: cluster.quorate === 1 || cluster.quorate === true,
    // Trust the cluster entry's own node count when present.
    nodes_total: cluster.nodes ?? nodes.length,
    nodes_online: nodes.filter(n => n.online).length,
    nodes,
    ha: {
      quorum_ok: quorum.quorate === 1 || quorum.status === 'OK',
      master: master.node || null,
      fencing: fencing['armed-state'] || (fencing.status ? 'armed' : null),
      services_total: services.length,
      services_error: servicesError.length,
      services
    }
  };
}
|
||||
|
||||
/**
 * Fetch and normalize cluster health. Failures are reported in the result
 * (`{ error, at }`) instead of being thrown.
 *
 * @param {object} [opts]
 * @param {string} [opts.apiUrl] - Overrides PROXMOX_API_URL.
 * @param {string} [opts.token] - Overrides the token taken from the environment.
 * @param {Function} [opts.fetchImpl] - fetch-compatible function; defaults to global fetch.
 * @returns {Promise<object>} The normalizeCluster() shape plus `at` (ms
 *   timestamp), or `{ error, at }` when unconfigured or the status call fails.
 */
export async function clusterHealth(opts = {}) {
  const apiUrl = opts.apiUrl || process.env.PROXMOX_API_URL;
  // NOTE(review): falls back to PROXMOX_API_TOKEN when PROXMOX_RO_TOKEN is
  // unset; confirm that is intended if that token carries more than audit rights.
  const token = opts.token || process.env.PROXMOX_RO_TOKEN || process.env.PROXMOX_API_TOKEN;
  const fetchImpl = opts.fetchImpl || fetch;

  if (!apiUrl || !token) return { error: 'proxmox_not_configured', at: Date.now() };

  const cfg = { apiUrl, token, fetchImpl };
  try {
    const statusCall = pveGet('/cluster/status', cfg);
    // HA may be absent on a bare cluster, so a failed HA call degrades to "no HA data".
    const haCall = pveGet('/cluster/ha/status/current', cfg).catch(() => []);
    const [status, ha] = await Promise.all([statusCall, haCall]);
    return { ...normalizeCluster(status, ha), at: Date.now() };
  } catch (e) {
    return { error: String(e.message || e), at: Date.now() };
  }
}
|
||||
Reference in New Issue
Block a user