feat(health): probe + classify engine on a 60s cron

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
root
2026-06-02 22:55:03 +10:00
parent 5b05fd4730
commit af0cac4e6b
3 changed files with 71 additions and 0 deletions

View File

@@ -2,6 +2,9 @@ import cron from 'node-cron';
import { runSync } from './sync_source_docs.js'; import { runSync } from './sync_source_docs.js';
import { log } from '../log.js'; import { log } from '../log.js';
import { enqueue } from '../jobs/queue.js'; import { enqueue } from '../jobs/queue.js';
import { load } from '../health/registry.js';
import { checkAll } from '../health/checker.js';
import * as statusRepo from '../db/repos/service_status.js';
export function startCron() { export function startCron() {
// Daily at 03:00 local time // Daily at 03:00 local time
@@ -20,5 +23,13 @@ export function startCron() {
catch (e) { log.error({ err: e }, 'cron speedtest failed'); } catch (e) { log.error({ err: e }, 'cron speedtest failed'); }
}); });
cron.schedule('*/1 * * * *', async () => {
try {
const results = await checkAll(load());
for (const r of results) await statusRepo.upsert(r);
log.info({ n: results.length }, 'health check complete');
} catch (e) { log.error({ err: e }, 'health check failed'); }
});
log.info('cron started'); log.info('cron started');
} }

41
lib/health/checker.js Normal file
View File

@@ -0,0 +1,41 @@
import net from 'node:net';
// A successful probe slower than this many milliseconds is reported as 'warn'.
const SLOW_MS = 3000;

/**
 * Turn a raw probe outcome into a status record.
 *
 * @param {object} outcome
 * @param {boolean} [outcome.ok] - the probe succeeded.
 * @param {boolean} [outcome.reachable] - the target answered, even if not successfully.
 * @param {number} [outcome.latency] - elapsed milliseconds; may be absent.
 * @param {string} [outcome.error] - short failure description.
 * @returns {{status: 'ok'|'warn'|'down', latency_ms: number|null, detail: string}}
 *   'ok' for a success within SLOW_MS, 'warn' for a slow success or a
 *   reachable-but-failing target, 'down' otherwise.
 */
export function classify({ ok, reachable, latency, error }) {
  // Normalise once so every branch reports null (never undefined) when the
  // probe did not supply a latency.
  const latencyMs = latency ?? null;

  if (ok) {
    const slow = latencyMs !== null && latencyMs > SLOW_MS;
    return {
      status: slow ? 'warn' : 'ok',
      latency_ms: latencyMs,
      // Avoid rendering "undefinedms" when the latency is unknown.
      detail: latencyMs === null ? 'ok' : `${latencyMs}ms`,
    };
  }

  if (reachable) {
    return { status: 'warn', latency_ms: latencyMs, detail: 'degraded' };
  }

  return { status: 'down', latency_ms: null, detail: error || 'unreachable' };
}
// Ports for schemes whose default port the URL parser reports as '' in `url.port`.
const DEFAULT_PORTS = { 'http:': 80, 'https:': 443 };

/**
 * Reduce a thrown probe error to a short string for the status detail.
 * Prefers a string error code (on the error itself or on its `cause`, where
 * fetch puts the underlying socket error) over the generic message.
 */
function errorDetail(e) {
  // AbortSignal.timeout() rejects with a TimeoutError whose `code` is the
  // numeric legacy DOMException code; report it like the TCP branch does.
  if (e?.name === 'TimeoutError') return 'timeout';
  if (typeof e?.code === 'string') return e.code;
  if (typeof e?.cause?.code === 'string') return e.cause.code;
  return e?.message || String(e);
}

// Default probe: HTTP (status 2xx/3xx) or TCP connect. Only called with
// operator-configured URLs from the registry — never user input.
/**
 * Probe one service and report the raw outcome.
 *
 * @param {{url: string, check?: {type?: string, path?: string}}} svc
 *   `check.type` of 'tcp' selects a TCP connect; anything else (or nothing)
 *   selects an HTTP GET of `url` + `check.path`.
 * @returns {Promise<{ok: boolean, reachable?: boolean, latency: number, error?: string}>}
 *   Never rejects for connection failures: those resolve with `ok: false`,
 *   `reachable: false` and a short `error` string.
 */
export async function probe(svc) {
  const started = Date.now();
  const type = svc.check?.type || 'http';
  try {
    if (type === 'tcp') {
      const u = new URL(svc.url);
      // `u.port` is '' when the URL omits the port or uses the scheme default,
      // so fall back to the scheme's well-known port instead of dialing port 0.
      const port = u.port ? Number(u.port) : DEFAULT_PORTS[u.protocol];
      if (!port) throw new Error('missing port');
      // URL keeps the square brackets around IPv6 literals; net.connect wants them stripped.
      const host = u.hostname.replace(/^\[|\]$/g, '');
      await new Promise((resolve, reject) => {
        const sock = net.connect({ host, port }, () => { sock.end(); resolve(); });
        sock.setTimeout(5000);
        sock.on('timeout', () => { sock.destroy(); reject(new Error('timeout')); });
        sock.on('error', reject);
      });
      return { ok: true, latency: Date.now() - started };
    }

    const base = svc.url.replace(/\/$/, '');
    const url = base + (svc.check?.path || '');
    // redirect: 'manual' so a 3xx is reported as-is rather than followed.
    const res = await fetch(url, { redirect: 'manual', signal: AbortSignal.timeout(6000) });
    // Take the latency before discarding the body so body handling is not counted.
    const latency = Date.now() - started;
    // The body is not needed; cancel it so the connection is released promptly
    // rather than waiting for garbage collection. A failed cancel must not
    // turn a reachable service into a failed probe.
    await res.body?.cancel().catch(() => {});
    const ok = res.status >= 200 && res.status < 400;
    return { ok, reachable: true, latency };
  } catch (e) {
    return { ok: false, reachable: false, latency: Date.now() - started, error: errorDetail(e) };
  }
}
/**
 * Probe every service concurrently and classify each outcome.
 *
 * @param {Array<{id: *}>} services - services to check; each is passed to `probeFn`.
 * @param {Function} [probeFn=probe] - async function returning a probe outcome for one service.
 * @returns {Promise<Array<{service_id: *, status: string, latency_ms: number|null, detail: string}>>}
 *   One status record per service, in input order.
 */
export async function checkAll(services, probeFn = probe) {
  return Promise.all(services.map(async (svc) => {
    let verdict;
    try {
      verdict = classify(await probeFn(svc));
    } catch (e) {
      // Promise.all rejects as soon as any member rejects, so one misbehaving
      // probe would otherwise discard every other service's result. Report
      // that single service as down instead.
      verdict = classify({
        ok: false,
        reachable: false,
        error: e?.code || e?.message || String(e),
      });
    }
    return { service_id: svc.id, ...verdict };
  }));
}

View File

@@ -0,0 +1,19 @@
import { describe, it, expect, vi } from 'vitest';
import { classify, checkAll } from '../../lib/health/checker.js';
// classify(): one case per status outcome.
describe('health classify', () => {
  it('ok when reachable and fast', () => {
    const { status } = classify({ ok: true, latency: 120 });
    expect(status).toBe('ok');
  });

  it('warn when reachable but slow', () => {
    const { status } = classify({ ok: true, latency: 4000 });
    expect(status).toBe('warn');
  });

  it('warn on non-2xx/3xx reachable', () => {
    const { status } = classify({ ok: false, reachable: true, latency: 50 });
    expect(status).toBe('warn');
  });

  it('down when unreachable', () => {
    const { status } = classify({ ok: false, reachable: false, error: 'ECONN' });
    expect(status).toBe('down');
  });
});
// checkAll(): uses an injected probe so no real network access happens.
describe('checkAll', () => {
  it('probes each service and returns a status per id', async () => {
    const services = [
      { id: 'a', url: 'http://x' },
      { id: 'b', url: 'http://y' },
    ];
    const fakeProbe = vi.fn().mockResolvedValue({ ok: true, latency: 30 });

    const results = await checkAll(services, fakeProbe);

    const ids = results.map(({ service_id }) => service_id).sort();
    expect(ids).toEqual(['a', 'b']);

    const allOk = results.every(({ status }) => status === 'ok');
    expect(allOk).toBe(true);
  });
});