From b0b23ba05d815dab1686ab6837989d1f3ef43e77 Mon Sep 17 00:00:00 2001 From: root Date: Mon, 8 Jun 2026 15:20:38 +1000 Subject: [PATCH] feat(infra): commit live infra-audit/cluster work to reconcile git with prod MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This work (network_hosts inventory + infra_audit MCP tool, /api/cluster + Sacred Valley cluster card, topbar cluster-health pill + SW self-heal) was built in an earlier session and DEPLOYED to CT 311 as alpha.24–26, but was never committed to git — prod was running code absent from the repo. Commits it as-is (already prod-validated) so git matches the live state, and restores its alpha.24/25/26 CHANGELOG entries. Files are disjoint from the fold-in work; both now ship together under alpha.27. Co-Authored-By: Claude Opus 4.8 --- CHANGELOG.md | 13 +++ config/services.json | 4 +- lib/ai/agent/tools/blue/index.js | 5 +- lib/ai/agent/tools/blue/infra_audit.js | 17 ++++ lib/api/index.js | 4 + lib/api/routes/cluster.js | 17 ++++ lib/api/routes/infra.js | 26 ++++++ lib/db/migrations/023_network_hosts.sql | 45 +++++++++++ lib/db/repos/network_hosts.js | 28 +++++++ lib/infra/audit.js | 86 ++++++++++++++++++++ lib/proxmox/cluster.js | 76 ++++++++++++++++++ public/components/topbar.js | 30 +++++++ public/index.html | 20 +++++ public/style.css | 19 +++++ public/views/cards/cluster.js | 44 +++++++++++ public/views/sacred_valley.js | 3 +- tests/ai/agent/tools/blue.test.js | 10 +++ tests/infra/audit.test.js | 100 ++++++++++++++++++++++++ tests/proxmox/cluster.test.js | 63 +++++++++++++++ 19 files changed, 606 insertions(+), 4 deletions(-) create mode 100644 lib/ai/agent/tools/blue/infra_audit.js create mode 100644 lib/api/routes/cluster.js create mode 100644 lib/api/routes/infra.js create mode 100644 lib/db/migrations/023_network_hosts.sql create mode 100644 lib/db/repos/network_hosts.js create mode 100644 lib/infra/audit.js create mode 100644 lib/proxmox/cluster.js create mode 
100644 public/views/cards/cluster.js create mode 100644 tests/infra/audit.test.js create mode 100644 tests/proxmox/cluster.test.js diff --git a/CHANGELOG.md b/CHANGELOG.md index 48cd5c3..be48eb8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,19 @@ Format: [Keep a Changelog](https://keepachangelog.com). - feat: phuryn usage dashboard now reachable at aiusage.hynesy.com behind CF Access. - feat: Sacred Valley AI Usage card opens the in-Void #/ai-usage route. +## 2.0.0-alpha.26 — Topbar cluster-health pill + always-fresh self-heal +- **Topbar cluster-health indicator** (`public/components/topbar.js`): a themed pill left of Inbox/Chat/Owner that polls `/api/cluster` every 30s and shows **healthy** (green) when quorate + all nodes online + HA clean, **HA issue / node down / no quorum** (amber/red) otherwise. Click → Sacred Valley. Reuses the `--ok/--warn/--bad` dot palette. +- **Always-fresh self-heal** (`public/index.html`): inline pre-module script unregisters any service worker and clears caches on every load. The legacy Void 1 caching SW (origin-scoped to `void.hynesy.com`) was serving stale assets that survived hard reloads; this removes it on the next load and prevents recurrence on every device. Assets are already served `no-cache`, so with no SW the app is always fresh. + +## 2.0.0-alpha.25 — Cluster health Sacred Valley card +- **`GET /api/cluster`** (`lib/proxmox/cluster.js` + route, 10s-cached): read-only Proxmox cluster health — `quorate`, per-node online state, HA master/fencing, and HA service count + error count. Pure `normalizeCluster()` folds `/cluster/status` + `/cluster/ha/status/current`; unit-tested with injected fetch. Uses a **dedicated read-only PVE token** (`PROXMOX_RO_TOKEN`, user `void-ro@pve` with `PVEAuditor` on `/`) — never the power-action token. 
+- **Sacred Valley "Cluster · HZ" card** (`public/views/cards/cluster.js`, registered in `sacred_valley.js`): polls every 30s, shows the quorum badge, node up/down dots, master, and HA-service issues. Reuses the tile status palette (blackflame `--ok`/`--warn`/`--bad`). + +## 2.0.0-alpha.24 — Infra sanity check + LAN host/MAC inventory +- **`network_hosts` inventory table** (`migration 023`, repo `lib/db/repos/network_hosts.js`): authoritative id→ip→MAC map of every cluster guest + PVE host + the Pi QDevice, seeded from a live capture. Source of truth for router DHCP reservations (the LAN pool is the whole `.2–.254`, so each pinned guest needs a static IP + a MAC reservation) and for the audit below. Idempotent seed (`ON CONFLICT DO UPDATE`). +- **`infra_audit` sanity check** (`lib/infra/audit.js`, `GET /api/infra/audit`, MCP tool `infra_audit` in `blueRegistry`): probes every `192.168.x.y:port` referenced in the Wiki **and** every enabled service URL, reports unreachable endpoints (stale/incorrect IPs or ports) grouped by source, plus inventory hosts missing a MAC. Read-only TCP connects; available to the owner or any authed agent (e.g. Little Blue) so agents can verify the docs/registry match reality. +- **Service registry IP fixes**: `magicmirror` → `192.168.1.224`, `obd2` → `192.168.1.225` (moved off contested DHCP-range addresses to static). + ## 2.0.0-alpha.23 — Local/remote-aware service tiles - **Optional `external` URL per service** (`migration 022`, `config/services.json`, repo + `/api/health/services` payload + `svcBody`): Little Blue health-band tiles previously linked to the single LAN `url`, so they opened dead private IPs when browsing remotely (e.g. Gramps `http://192.168.1.99`). Migration adds the column and **backfills** curated domains by id (the live instance is already seeded, so a column-add alone wouldn't populate them); also normalises `jellyfin`/`chaptarr` (which stored a domain in `url`) to LAN `url` + `external`. 
 - **Context-based tile target + one-click alt** (`public/views/service_url.js`, `public/components/service_tile.js`, `public/views/health_band.js`): the tile picks its primary URL from `location.hostname` — public host (e.g. `void.hynesy.com`) opens the domain, private IP/localhost/.local opens the LAN address — and always offers a `⇄` alt to the *other* URL (a reliable manual fallback; an auto-probe can't work because an HTTPS dashboard is blocked from probing `http://` LAN IPs by mixed-content). Services with no `external` are dimmed with a "LAN-only" badge when remote. Tile root is now a `div` with a stretched primary `<a>` + sibling alt `<a>` (no nested anchors). Health checker unchanged (still probes LAN `url` from CT 311). diff --git a/config/services.json b/config/services.json index 1a2b541..bfa06f9 100644 --- a/config/services.json +++ b/config/services.json @@ -10,7 +10,7 @@ { "id": "gramps", "name": "Gramps Web", "category": "infrastructure", "host": "ct109", "url": "http://192.168.1.99", "external": "https://gramps.hynesy.com", "icon": "gramps" }, { "id": "scanopy", "name": "Scanopy", "category": "infrastructure", "host": "ct100", "url": "http://192.168.1.230:60072", "icon": "scanopy" }, { "id": "homelab", "name": "Homelable", "category": "infrastructure", "host": "ct100", "url": "http://192.168.1.230:3000", "icon": "" }, - { "id": "obd2", "name": "OBD2", "category": "infrastructure", "host": "ct .28", "url": "http://192.168.1.28:8384", "icon": "" }, + { "id": "obd2", "name": "OBD2", "category": "infrastructure", "host": "ct112 · .225", "url": "http://192.168.1.225:8384", "icon": "" }, { "id": "pterodactyl", "name": "Pterodactyl", "category": "infrastructure", "host": "192.168.1.247", "url": "http://192.168.1.247", "icon": "pterodactyl" }, { "id": "pve-z", "name": "Proxmox · z", "category": "infrastructure", "host": "z", "url": "https://192.168.1.124:8006", "icon": "proxmox", "check": { "type": "tcp" } }, { "id": "pve-z3", "name": "Proxmox · Z3", "category": 
"infrastructure", "host": "z3", "url": "https://192.168.1.125:8006", "icon": "proxmox", "check": { "type": "tcp" } }, @@ -25,6 +25,6 @@ { "id": "void1", "name": "The Void 1.x", "category": "other", "host": "ct301", "url": "http://192.168.1.11:2424", "icon": "void" }, { "id": "farm-timelapse", "name": "Farm Timelapse", "category": "other", "host": "192.168.1.108", "url": "http://192.168.1.108:8000", "icon": "" }, - { "id": "magicmirror", "name": "MagicMirror", "category": "other", "host": "192.168.1.27", "url": "http://192.168.1.27:8080", "icon": "magicmirror" }, + { "id": "magicmirror", "name": "MagicMirror", "category": "other", "host": "ct111 · .224", "url": "http://192.168.1.224:8080", "icon": "magicmirror" }, { "id": "claude-usage", "name": "Claude Usage", "category": "other", "host": "ct300", "url": "http://192.168.1.212:8080", "icon": "claude" } ] diff --git a/lib/ai/agent/tools/blue/index.js b/lib/ai/agent/tools/blue/index.js index 73b0b4f..15f6994 100644 --- a/lib/ai/agent/tools/blue/index.js +++ b/lib/ai/agent/tools/blue/index.js @@ -1,9 +1,12 @@ import { createRegistry } from '../../registry.js'; import { searchTool } from '../search.js'; import { listActionsTool, proposeActionTool } from './actions.js'; +import { infraAuditTool } from './infra_audit.js'; -// read (search) + her action tools. No propose_change (she fixes infra, not content). +// read (search) + her action tools + infra sanity check. No propose_change +// (she fixes infra, not content). export const blueRegistry = createRegistry(); blueRegistry.registerTool(searchTool); blueRegistry.registerTool(listActionsTool); blueRegistry.registerTool(proposeActionTool); +blueRegistry.registerTool(infraAuditTool); diff --git a/lib/ai/agent/tools/blue/infra_audit.js b/lib/ai/agent/tools/blue/infra_audit.js new file mode 100644 index 0000000..5aafcc9 --- /dev/null +++ b/lib/ai/agent/tools/blue/infra_audit.js @@ -0,0 +1,17 @@ +// Little Blue's infra sanity check. 
// Little Blue's infra sanity check tool. It holds no infra credentials itself:
// it calls the main server's /api/infra/audit endpoint with the agent token and
// relays the JSON report.

/**
 * Read the main server's base URL and the agent bearer token from the environment.
 * @param {object} [env=process.env] Environment map to read from.
 * @returns {{ base: string|undefined, token: string|undefined }}
 */
function api(env = process.env) {
  return { base: env.VOID_API_URL, token: env.VOID_AGENT_TOKEN };
}

export const infraAuditTool = {
  name: 'infra_audit',
  description: 'Run a homelab sanity check: probe every IP:port the wiki references and every monitored service, and report unreachable endpoints (stale/incorrect IPs or ports) plus inventory hosts missing a MAC. Read-only — use to verify the docs/registry match reality.',
  input_schema: { type: 'object', properties: {} },

  /**
   * Fetch the audit report from the main server.
   *
   * Never throws: every failure (missing config, network error, non-2xx status,
   * unparsable body) is reported as `{ error }` so the calling agent always
   * receives a tool result.
   *
   * @param {object} _args Unused (the tool takes no input).
   * @param {object} _ctx Unused.
   * @param {{ fetchImpl?: Function }} [deps] Injectable fetch, for tests.
   * @returns {Promise<object>} The audit report, or `{ error: string }`.
   */
  async handler(_args, _ctx, { fetchImpl = fetch } = {}) {
    const { base, token } = api();
    if (!base) return { error: 'infra_audit not configured (VOID_API_URL unset)' };

    // Tolerate a trailing slash on the base so the path never becomes "//api/...".
    const url = `${base.replace(/\/+$/, '')}/api/infra/audit`;
    try {
      const res = await fetchImpl(url, { headers: { Authorization: `Bearer ${token}` } });
      if (!res.ok) return { error: `infra_audit ${res.status}` };
      return await res.json();
    } catch (err) {
      // Connection refused, DNS failure, invalid JSON, ...
      return { error: `infra_audit failed: ${err?.message ?? err}` };
    }
  },
};
// Read-only cluster health for the Sacred Valley card. A result is reused for
// TTL milliseconds, and requests that arrive while a refresh is already running
// share that refresh, so concurrent polling clients coalesce into one
// clusterHealth() call. Intended for the owner or any authed agent; no auth
// check is performed in this module.
export const router = Router();

const TTL = 10_000;
let cache = { at: 0, data: null };
let inflight = null; // pending refresh shared by every request that arrives meanwhile

/**
 * Return the cached health snapshot while it is fresh, otherwise join (or start)
 * the single in-flight refresh.
 * @returns {Promise<object>} Whatever clusterHealth() resolved to.
 */
function cachedClusterHealth() {
  if (cache.data && Date.now() - cache.at < TTL) return Promise.resolve(cache.data);
  inflight ??= clusterHealth()
    .then((data) => {
      cache = { at: Date.now(), data };
      return data;
    })
    .finally(() => {
      // Always release the slot so a failed refresh cannot block later ones.
      inflight = null;
    });
  return inflight;
}

router.get('/', asyncWrap(async (_req, res) => {
  res.json(await cachedClusterHealth());
}));
// Read-only infra sanity-check routes: /audit probes every endpoint referenced
// by the wiki pages plus every enabled http(s) service URL and lists inventory
// hosts with no recorded MAC; /hosts returns the inventory itself. Nothing here
// mutates state.
export const router = Router();

// Per-endpoint timeout (ms) passed to tcpProbe.
const PROBE_TIMEOUT_MS = 1500;
const HTTP_URL = /^https?:\/\//;

function probe(host, port) {
  return tcpProbe(host, port, PROBE_TIMEOUT_MS);
}

// Title + markdown body of every page in the space whose slug is 'wiki'.
async function wikiPages() {
  const result = await pool.query(
    `SELECT p.title, p.body_md FROM pages p JOIN spaces s ON s.id = p.space_id WHERE s.slug = 'wiki'`);
  return result.rows;
}

router.get('/audit', asyncWrap(async (_req, res) => {
  const pages = await wikiPages();
  const enabled = await monitored.listEnabled();
  // Only http(s) URLs are audited; anything else is dropped before probing.
  const services = enabled.filter((svc) => HTTP_URL.test(svc.url || ''));
  const report = await runAudit({ pages, services, probe });
  const hostsWithoutMac = await networkHosts.missingMac();
  res.json({ ...report, inventory: { missing_mac: hostsWithoutMac.map((host) => host.id) } });
}));

router.get('/hosts', asyncWrap(async (_req, res) => {
  const hosts = await networkHosts.all();
  res.json({ hosts });
}));
+INSERT INTO network_hosts (id, kind, name, node, ip, mac, note) VALUES + ('ct100','lxc','mediastack','z','192.168.1.230','BC:24:11:D8:2B:7F','Docker media host'), + ('ct102','lxc','ollama','z','192.168.1.185','BC:24:11:06:89:40','Ollama (GPU)'), + ('ct103','lxc','openwebui','z','192.168.1.231','BC:24:11:98:28:A1','Open WebUI'), + ('ct104','lxc','bookstack','z','192.168.1.213','BC:24:11:C3:F4:0A','BookStack mirror'), + ('ct105','lxc','gitea','z','192.168.1.223','BC:24:11:AA:2B:4E','Gitea (static, was DHCP)'), + ('ct106','lxc','pihole','z','192.168.1.140','BC:24:11:DB:2A:39','Pi-hole DNS adblock'), + ('ct107','lxc','iventoy','z','192.168.1.150','BC:24:11:9B:01:10','PXE (parked, donatello-vm rootfs)'), + ('ct108','lxc','tlcapture','z','192.168.1.108','BC:24:11:6D:97:27','Farm Timelapse'), + ('ct109','lxc','gramps','z','192.168.1.99','BC:24:11:8E:D3:58','Gramps Web'), + ('ct110','lxc','n8n','z','192.168.1.235','BC:24:11:28:70:30','n8n'), + ('ct111','lxc','magicmirror','z','192.168.1.224','BC:24:11:6C:D4:E6','MagicMirror (static, was DHCP .27)'), + ('ct112','lxc','obd2','z','192.168.1.225','BC:24:11:E7:D8:BF','OBD2 telemetry (static, was DHCP .28)'), + ('ct300','lxc','claude','z','192.168.1.212','BC:24:11:9E:AA:73','Claude Code workspace'), + ('ct301','lxc','void1','z','192.168.1.11','BC:24:11:4D:B7:CC','Void 1.x legacy'), + ('ct310','lxc','void2-db','z','192.168.1.215','BC:24:11:49:C6:29','Void 2.0 Postgres'), + ('ct311','lxc','void2-app','z','192.168.1.216','BC:24:11:9B:B7:3A','Void 2.0 app'), + ('vm117','vm','Pterodactyl-Deb','z','192.168.1.247','BC:24:11:37:C1:F7','Game panel (static, in-guest)'), + ('vm200','vm','OpenClaw','z','192.168.1.183','BC:24:11:29:84:B9','OpenClaw agent (static, in-guest)'), + ('pve-z','pve-host','z','z','192.168.1.124','00:E0:4C:0F:36:00','Cluster node 1 (GPU)'), + ('pve-z3','pve-host','Z3','Z3','192.168.1.125','6C:0B:5E:78:1C:93','Cluster node 2 (HA target)'), + 
// Columns returned by every query in this repo.
const COLS = 'id, kind, name, node, ip, mac, note, updated_at';

// Guest/host LAN inventory (id -> ip -> mac) backed by the network_hosts table.

/**
 * List every inventory row, ordered by id.
 * @returns {Promise<object[]>}
 */
export async function all() {
  const result = await pool.query(`SELECT ${COLS} FROM network_hosts ORDER BY id`);
  return result.rows;
}

/**
 * Look up one host by id.
 * @param {string} id Inventory id.
 * @returns {Promise<object|null>} The row, or null when no row matches.
 */
export async function get(id) {
  const result = await pool.query(`SELECT ${COLS} FROM network_hosts WHERE id=$1`, [id]);
  const [host] = result.rows;
  return host || null;
}

/**
 * List the hosts whose mac column is NULL, ordered by id.
 * @returns {Promise<object[]>}
 */
export async function missingMac() {
  const result = await pool.query(`SELECT ${COLS} FROM network_hosts WHERE mac IS NULL ORDER BY id`);
  return result.rows;
}

/**
 * Record a MAC for a host and bump its updated_at.
 * @param {string} id Inventory id.
 * @param {string} mac MAC address to store.
 * @returns {Promise<object|null>} The updated row, or null when the id is unknown.
 */
export async function setMac(id, mac) {
  const result = await pool.query(
    `UPDATE network_hosts SET mac=$2, updated_at=now() WHERE id=$1 RETURNING ${COLS}`, [id, mac]);
  const [updated] = result.rows;
  return updated || null;
}
/**
 * Cross-check every endpoint referenced in the wiki pages against live
 * reachability and report the unreachable ones together with the pages that
 * mention them.
 *
 * Endpoints are de-duplicated by host:port across pages. Endpoints without a
 * port cannot be probed and are returned separately as `unprobed`. All probes
 * are started together so the audit takes roughly one probe timeout rather
 * than one timeout per stale endpoint; result order follows first appearance.
 *
 * @param {object} args
 * @param {Array<{title: string, body_md: string}>} [args.pages] Pages to scan.
 * @param {(host: string, port: number) => Promise<boolean>} args.probe
 *   Resolves true when the endpoint answers.
 * @returns {Promise<{ok: boolean, summary: object, unreachable: object[], unprobed: object[]}>}
 */
export async function auditDocs({ pages, probe }) {
  const byEndpoint = new Map(); // "host:port" -> { host, port, pages:Set<title> }
  for (const page of pages || []) {
    for (const ep of extractEndpoints(page.body_md)) {
      const key = `${ep.host}:${ep.port ?? ''}`;
      let entry = byEndpoint.get(key);
      if (!entry) {
        entry = { host: ep.host, port: ep.port, pages: new Set() };
        byEndpoint.set(key, entry);
      }
      entry.pages.add(page.title);
    }
  }

  const all = [...byEndpoint.values()];
  const probable = all.filter((e) => e.port != null);
  const unprobed = all
    .filter((e) => e.port == null)
    .map((e) => ({ host: e.host, port: null, pages: [...e.pages] }));

  // Probe concurrently; Promise.all keeps results aligned with `probable`.
  const answers = await Promise.all(probable.map((e) => probe(e.host, e.port)));
  const unreachable = probable
    .filter((_e, i) => !answers[i])
    .map((e) => ({ host: e.host, port: e.port, pages: [...e.pages] }));

  return {
    ok: unreachable.length === 0,
    summary: {
      endpoints: all.length,
      probed: probable.length,
      reachable: probable.length - unreachable.length,
      unreachable: unreachable.length,
    },
    unreachable,
    unprobed,
  };
}
/**
 * Probe each registered service's URL and flag any that do not answer.
 *
 * Services whose URL cannot be parsed into host/port are skipped and not
 * counted. Probes are started together so unreachable services do not add up
 * to one timeout each; result order follows the input order.
 *
 * @param {object} args
 * @param {Array<{id: string, url: string}>} [args.services] Services to check.
 * @param {(host: string, port: number) => Promise<boolean>} args.probe
 *   Resolves true when the endpoint answers.
 * @returns {Promise<{ok: boolean, summary: {probed: number, unreachable: number}, unreachable: object[]}>}
 */
export async function auditServices({ services, probe }) {
  const targets = [];
  for (const service of services || []) {
    const hp = parseUrl(service.url);
    if (hp) targets.push({ service, hp });
  }

  // Probe concurrently; Promise.all keeps results aligned with `targets`.
  const answers = await Promise.all(targets.map(({ hp }) => probe(hp.host, hp.port)));
  const unreachable = targets
    .filter((_t, i) => !answers[i])
    .map(({ service, hp }) => ({ id: service.id, url: service.url, host: hp.host, port: hp.port }));

  return {
    ok: unreachable.length === 0,
    summary: { probed: targets.length, unreachable: unreachable.length },
    unreachable,
  };
}

/**
 * Full sanity sweep used by the API route / MCP tool: wiki endpoints plus
 * registered services, run side by side.
 *
 * @param {object} [args]
 * @param {object[]} [args.pages=[]] Wiki pages passed to auditDocs.
 * @param {object[]} [args.services=[]] Services passed to auditServices.
 * @param {(host: string, port: number) => Promise<boolean>} [args.probe=tcpProbe]
 * @returns {Promise<{ok: boolean, docs: object, services: object}>}
 */
export async function runAudit({ pages = [], services = [], probe = tcpProbe } = {}) {
  const [docs, svc] = await Promise.all([
    auditDocs({ pages, probe }),
    auditServices({ services, probe }),
  ]);
  return { ok: docs.ok && svc.ok, docs, services: svc };
}
// True for the encodings of a set flag accepted here: 1, true, or the string "1".
// NOTE(review): the string form is accepted defensively — confirm against live
// PVE output whether /cluster/ha/status/current emits quorate as "1".
function isFlagSet(value) {
  return value === 1 || value === true || value === '1';
}

/**
 * Pure: fold the /cluster/status and /cluster/ha/status/current payloads into
 * the shape the cluster card consumes.
 *
 * Non-array inputs are treated as empty, so a missing HA payload yields an
 * empty `ha` section instead of throwing. An HA service counts as an "error"
 * when its state is not one of SETTLED_STATES.
 *
 * @param {object[]} [statusData=[]] Entries from /cluster/status.
 * @param {object[]} [haData=[]] Entries from /cluster/ha/status/current.
 * @returns {{name: string|null, quorate: boolean, nodes_total: number,
 *   nodes_online: number, nodes: object[], ha: object}}
 */
export function normalizeCluster(statusData = [], haData = []) {
  const status = Array.isArray(statusData) ? statusData : [];
  const ha = Array.isArray(haData) ? haData : [];
  const firstOfType = (list, type) => list.find((e) => e?.type === type) || {};

  const cluster = firstOfType(status, 'cluster');
  const nodes = status
    .filter((e) => e?.type === 'node')
    .map((n) => ({ name: n.name, online: isFlagSet(n.online), local: !!n.local, ip: n.ip || null }))
    // String() guards the sort against an entry that lacks a name.
    .sort((a, b) => String(a.name ?? '').localeCompare(String(b.name ?? '')));

  const quorum = firstOfType(ha, 'quorum');
  const master = firstOfType(ha, 'master');
  const fencing = firstOfType(ha, 'fencing');
  const services = ha
    .filter((e) => e?.type === 'service')
    .map((s) => ({
      sid: s.sid || (s.id || '').replace(/^service:/, ''),
      state: s.state || s.crm_state || 'unknown',
      node: s.node || null,
    }))
    .sort((a, b) => a.sid.localeCompare(b.sid));
  const servicesError = services.filter((s) => !SETTLED_STATES.has(s.state));

  return {
    name: cluster.name || null,
    quorate: isFlagSet(cluster.quorate),
    nodes_total: cluster.nodes ?? nodes.length,
    nodes_online: nodes.filter((n) => n.online).length,
    nodes,
    ha: {
      // Same flag parsing as the cluster-level quorate, plus the textual status.
      quorum_ok: isFlagSet(quorum.quorate) || quorum.status === 'OK',
      master: master.node || null,
      fencing: fencing['armed-state'] || (fencing.status ? 'armed' : null),
      services_total: services.length,
      services_error: servicesError.length,
      services,
    },
  };
}

/**
 * Fetch and normalise cluster health. Never throws: configuration and request
 * failures are returned as `{ error, at }`.
 *
 * @param {object} [opts]
 * @param {string} [opts.apiUrl] Defaults to PROXMOX_API_URL.
 * @param {string} [opts.token] Defaults to PROXMOX_RO_TOKEN.
 * @param {Function} [opts.fetchImpl] Injectable fetch, for tests.
 * @returns {Promise<object>} normalizeCluster() output plus `at`, or `{ error, at }`.
 */
export async function clusterHealth(opts = {}) {
  const cfg = {
    apiUrl: opts.apiUrl || process.env.PROXMOX_API_URL,
    // NOTE(review): falls back to PROXMOX_API_TOKEN when no read-only token is
    // set. The module header says the power-action token must never be used
    // here — if PROXMOX_API_TOKEN is that token, this fallback should go.
    token: opts.token || process.env.PROXMOX_RO_TOKEN || process.env.PROXMOX_API_TOKEN,
    fetchImpl: opts.fetchImpl || fetch,
  };
  if (!cfg.apiUrl || !cfg.token) return { error: 'proxmox_not_configured', at: Date.now() };
  try {
    const [status, ha] = await Promise.all([
      pveGet('/cluster/status', cfg),
      pveGet('/cluster/ha/status/current', cfg).catch(() => []), // HA may be absent on a bare cluster
    ]);
    return { ...normalizeCluster(status, ha), at: Date.now() };
  } catch (e) {
    return { error: String(e.message || e), at: Date.now() };
  }
}
/**
 * Keep the topbar cluster pill in sync with /api/cluster.
 *
 * Polls immediately and then every 30 s. A poll is skipped while the previous
 * one is still awaiting its response, and polling stops for good once the pill
 * reports it is no longer attached to the document, so a re-rendered topbar
 * does not leave an orphaned timer behind.
 *
 * @param {HTMLElement} pill Button whose className/title reflect the status.
 * @param {HTMLElement} labelEl Element whose text shows the short label.
 * @returns {() => void} Function that stops polling.
 */
function startClusterHealth(pill, labelEl) {
  let busy = false;
  let timer = null;

  const stop = () => {
    if (timer !== null) {
      clearInterval(timer);
      timer = null;
    }
  };

  async function tick() {
    if (busy) return; // previous poll has not answered yet
    busy = true;
    try {
      let c = null;
      try { c = await api.get('/api/cluster'); } catch { c = { error: 'fetch' }; }
      const [status, label, title] = classifyCluster(c);
      pill.className = `icon-btn cluster-health status-${status}`;
      pill.title = title;
      labelEl.textContent = label;
    } finally {
      busy = false;
    }
  }

  tick();
  timer = setInterval(() => {
    // Strict comparison: only an explicit "detached" stops the poller.
    if (pill.isConnected === false) {
      stop();
      return;
    }
    tick();
  }, 30000);
  return stop;
}