feat(dross): voice Phase 2a — local whisper transcribe + mic (2.12.0)
faster-whisper (small.en, GPU+CPU fallback) on CT 102 → POST /api/voice/transcribe (multer→whisper client) → mic in the bubble records (MediaRecorder), uploads, drops the transcript into the input to review-and-send. Infra scripts in deploy/whisper/. Retention (P2b) next. NOTE: mic needs a secure context (the https domain), not the LAN IP. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
16
deploy/whisper/README.md
Normal file
16
deploy/whisper/README.md
Normal file
@@ -0,0 +1,16 @@
|
|||||||
|
# faster-whisper service (Dross voice STT)
|
||||||
|
|
||||||
|
Runs on **CT 102** (the Ollama box, `192.168.1.185`), bare-metal (no Docker), on the
|
||||||
|
RTX A2000 with CPU fallback. OpenAI-style `/transcribe` consumed by void-app
|
||||||
|
`lib/voice/whisper.js` (`WHISPER_URL=http://192.168.1.185:8001`).
|
||||||
|
|
||||||
|
## Install (on CT 102)
|
||||||
|
```
|
||||||
|
scp deploy/whisper/server.py root@192.168.1.185:/opt/whisper_server.py
scp deploy/whisper/setup.sh root@192.168.1.185:/root/setup.sh
|
||||||
|
ssh root@192.168.1.185 'bash /root/setup.sh && install -m644 /opt/whisper_server.py /opt/whisper/server.py && systemctl enable --now whisper'
|
||||||
|
curl http://192.168.1.185:8001/health # {"ok":true,"model":"small.en","device":"cuda"}
|
||||||
|
```
|
||||||
|
- venv at `/opt/whisper/venv`; model `small.en` (env `WHISPER_MODEL`); CUDA libs via
|
||||||
|
`nvidia-cublas-cu12`/`nvidia-cudnn-cu12` pip wheels (LD_LIBRARY_PATH in the unit).
|
||||||
|
- GPU → CPU fallback is in `server.py` `load()`.
|
||||||
|
- **CT 102 disk was expanded +20G** (was 89% full) before install.
|
||||||
35
deploy/whisper/server.py
Normal file
35
deploy/whisper/server.py
Normal file
@@ -0,0 +1,35 @@
|
|||||||
|
import os, tempfile
|
||||||
|
from fastapi import FastAPI, UploadFile, File, HTTPException
|
||||||
|
from faster_whisper import WhisperModel
|
||||||
|
|
||||||
|
MODEL = os.environ.get("WHISPER_MODEL", "small.en")
|
||||||
|
app = FastAPI()
|
||||||
|
model = None
|
||||||
|
device_used = None
|
||||||
|
|
||||||
|
def load():
    """Load the Whisper model, preferring the GPU and falling back to the CPU.

    Sets the module globals ``model`` and ``device_used``. If the CUDA load
    raises, the failure is logged and the model is loaded on the CPU with
    int8 weights instead. An error from the CPU load propagates to the caller.
    """
    global model, device_used
    import logging  # local import: keeps the file's import block untouched

    try:
        model = WhisperModel(MODEL, device="cuda", compute_type="int8_float16")
        device_used = "cuda"
    except Exception:
        # Deliberately broad: any GPU-side failure should degrade to CPU, but
        # record why so a silent CPU fallback can be diagnosed later.
        logging.getLogger(__name__).warning(
            "CUDA load of %r failed; falling back to CPU", MODEL, exc_info=True
        )
        model = WhisperModel(MODEL, device="cpu", compute_type="int8")
        device_used = "cpu"
|
||||||
|
|
||||||
|
load()
|
||||||
|
|
||||||
|
@app.get("/health")
def health():
    """Return a liveness payload with the configured model name and load device."""
    status = {"ok": True}
    status["model"] = MODEL
    status["device"] = device_used
    return status
|
||||||
|
|
||||||
|
@app.post("/transcribe")
async def transcribe(file: UploadFile = File(...)):
    """Transcribe an uploaded audio clip.

    Reads the whole upload, spools it to a temporary file, and runs the
    module-level ``model`` over it with ``beam_size=1`` and the VAD filter on.

    Returns a dict with ``text``, ``language``, ``duration`` (rounded to two
    decimals) and ``device``.

    Raises:
        HTTPException: 400 when the upload is empty.
    """
    import asyncio  # local import: only this handler needs it

    data = await file.read()
    if not data:
        raise HTTPException(400, "empty audio")

    def _decode(path):
        # The segments iterable is consumed here, inside the worker thread,
        # so all of the decoding work stays off the event loop.
        segments, info = model.transcribe(path, beam_size=1, vad_filter=True)
        return "".join(s.text for s in segments).strip(), info

    with tempfile.NamedTemporaryFile(suffix=".bin") as f:
        f.write(data)
        f.flush()
        # Inference is blocking; running it inline in an async handler would
        # stall every other request (including /health) until it finished.
        text, info = await asyncio.to_thread(_decode, f.name)

    return {
        "text": text,
        "language": info.language,
        "duration": round(info.duration, 2),
        "device": device_used,
    }
|
||||||
26
deploy/whisper/setup.sh
Normal file
26
deploy/whisper/setup.sh
Normal file
@@ -0,0 +1,26 @@
|
|||||||
|
#!/usr/bin/env bash
# Provision the faster-whisper service: system packages, a venv under
# /opt/whisper, the Python dependencies, and the systemd unit.
# The unit is only written and systemd reloaded here; it is not started.
set -e
export DEBIAN_FRONTEND=noninteractive
apt-get update -qq
apt-get install -y -qq python3-pip python3-venv ffmpeg >/dev/null
mkdir -p /opt/whisper
python3 -m venv /opt/whisper/venv
/opt/whisper/venv/bin/pip install -q --upgrade pip
/opt/whisper/venv/bin/pip install -q faster-whisper fastapi "uvicorn[standard]" python-multipart nvidia-cublas-cu12 nvidia-cudnn-cu12
# Ask the venv's own interpreter where site-packages lives instead of
# hard-coding a Python minor version; otherwise LD_LIBRARY_PATH below would
# point at a non-existent directory on any host that is not on python3.12.
SITE="$(/opt/whisper/venv/bin/python -c 'import sysconfig; print(sysconfig.get_paths()["purelib"])')"
# Unquoted heredoc delimiter: ${SITE} is expanded now, while writing the unit.
cat > /etc/systemd/system/whisper.service <<UNIT
[Unit]
Description=faster-whisper transcription server (Dross voice)
After=network.target
[Service]
Type=simple
WorkingDirectory=/opt/whisper
Environment=WHISPER_MODEL=small.en
Environment=LD_LIBRARY_PATH=${SITE}/nvidia/cublas/lib:${SITE}/nvidia/cudnn/lib
ExecStart=/opt/whisper/venv/bin/uvicorn server:app --host 0.0.0.0 --port 8001
Restart=on-failure
[Install]
WantedBy=multi-user.target
UNIT
systemctl daemon-reload
echo "deps+unit installed"
|
||||||
@@ -39,6 +39,7 @@ import { router as backupsRouter } from './routes/backups.js';
|
|||||||
import { router as kuttRouter } from './routes/kutt.js';
|
import { router as kuttRouter } from './routes/kutt.js';
|
||||||
import { router as themeRouter } from './routes/theme.js';
|
import { router as themeRouter } from './routes/theme.js';
|
||||||
import { router as drossRouter } from './routes/dross.js';
|
import { router as drossRouter } from './routes/dross.js';
|
||||||
|
import { router as voiceRouter } from './routes/voice.js';
|
||||||
|
|
||||||
export function mountApi(app) {
|
export function mountApi(app) {
|
||||||
const api = Router();
|
const api = Router();
|
||||||
@@ -75,6 +76,7 @@ export function mountApi(app) {
|
|||||||
api.use('/kutt', kuttRouter);
|
api.use('/kutt', kuttRouter);
|
||||||
api.use('/theme', themeRouter);
|
api.use('/theme', themeRouter);
|
||||||
api.use('/dross', drossRouter);
|
api.use('/dross', drossRouter);
|
||||||
|
api.use('/voice', voiceRouter);
|
||||||
api.use('/pending-changes', pendingChangesRouter);
|
api.use('/pending-changes', pendingChangesRouter);
|
||||||
api.use('/audit', auditRouter);
|
api.use('/audit', auditRouter);
|
||||||
api.use('/search', searchRouter);
|
api.use('/search', searchRouter);
|
||||||
|
|||||||
24
lib/api/routes/voice.js
Normal file
24
lib/api/routes/voice.js
Normal file
@@ -0,0 +1,24 @@
|
|||||||
|
import { Router } from 'express';
|
||||||
|
import multer from 'multer';
|
||||||
|
import { asyncWrap } from '../errors.js';
|
||||||
|
import { requireOwner } from '../cap.js';
|
||||||
|
import * as whisper from '../../voice/whisper.js';
|
||||||
|
export const router = Router();
|
||||||
|
|
||||||
|
// In-memory upload; clips are small voice notes. 25 MB ceiling.
|
||||||
|
const upload = multer({ storage: multer.memoryStorage(), limits: { fileSize: 25 * 1024 * 1024 } });
|
||||||
|
|
||||||
|
// POST /api/voice/transcribe — owner-only. multipart field `audio`. Returns { text }.
|
||||||
|
// (Phase 2b will optionally persist the clip + transcript when keepClips is on.)
|
||||||
|
router.post('/transcribe', requireOwner, upload.single('audio'), asyncWrap(async (req, res) => {
  const clip = req.file;

  // Reject a missing file part or a zero-length upload before calling whisper.
  if (!clip?.buffer?.length) {
    res.status(400);
    return res.json({ error: { code: 'no_audio', message: 'no audio supplied' } });
  }

  const name = clip.originalname || 'clip.webm';
  const mime = clip.mimetype || 'audio/webm';

  try {
    const { text, duration } = await whisper.transcribe(clip.buffer, name, mime);
    res.json({ text, duration: duration ?? null });
  } catch {
    // Any failure while transcribing is reported as the STT service being unavailable.
    res.status(503);
    res.json({ error: { code: 'stt_unavailable', message: 'transcription service unavailable' } });
  }
}));
|
||||||
22
lib/voice/whisper.js
Normal file
22
lib/voice/whisper.js
Normal file
@@ -0,0 +1,22 @@
|
|||||||
|
// Thin client for the local faster-whisper service on CT 102 (the Ollama box).
|
||||||
|
// GPU with CPU fallback lives in the service itself; here we just POST the audio
|
||||||
|
// buffer and return the transcript. LAN-only endpoint.
|
||||||
|
const WHISPER_URL = process.env.WHISPER_URL || 'http://192.168.1.185:8001';
|
||||||
|
|
||||||
|
/**
 * POST an audio buffer to the whisper service and return its transcript.
 *
 * @param buffer   audio bytes to upload
 * @param filename name sent with the multipart `file` field
 * @param mime     MIME type attached to the uploaded part
 * @returns {Promise<{text: string, duration: *, device: *}>} trimmed text plus
 *          the service's `duration` and `device` fields passed through
 * @throws Error `whisper <status>` when the service answers with a non-2xx status
 */
export async function transcribe(buffer, filename = 'clip.webm', mime = 'audio/webm') {
  const form = new FormData();
  const part = new Blob([buffer], { type: mime });
  form.append('file', part, filename);

  const response = await fetch(`${WHISPER_URL}/transcribe`, {
    method: 'POST',
    body: form,
    signal: AbortSignal.timeout(120000), // abort the request after 120 s
  });
  if (!response.ok) {
    throw new Error(`whisper ${response.status}`);
  }

  const { text, duration, device } = await response.json();
  return { text: (text || '').trim(), duration, device };
}
|
||||||
|
|
||||||
|
/**
 * Probe the whisper service's /health endpoint with a 5 s timeout.
 *
 * @returns the parsed JSON body on a 2xx response, otherwise `null`
 *          (non-2xx status, network error, timeout, or unparsable body).
 */
export async function health() {
  let response;
  try {
    response = await fetch(`${WHISPER_URL}/health`, { signal: AbortSignal.timeout(5000) });
  } catch {
    return null;
  }
  if (!response.ok) return null;
  try {
    return await response.json();
  } catch {
    return null;
  }
}
|
||||||
@@ -1,6 +1,6 @@
|
|||||||
{
|
{
|
||||||
"name": "void-server",
|
"name": "void-server",
|
||||||
"version": "2.11.0",
|
"version": "2.12.0",
|
||||||
"type": "module",
|
"type": "module",
|
||||||
"private": true,
|
"private": true,
|
||||||
"scripts": {
|
"scripts": {
|
||||||
|
|||||||
@@ -22,8 +22,9 @@ export async function renderDrossBubble() {
|
|||||||
const input = el('textarea', { rows: 1, placeholder: 'Ask Dross…' });
|
const input = el('textarea', { rows: 1, placeholder: 'Ask Dross…' });
|
||||||
const sendBtn = el('button', { class: 'dross-send', title: 'Send' },
|
const sendBtn = el('button', { class: 'dross-send', title: 'Send' },
|
||||||
el('span', { html: '<svg viewBox="0 0 24 24" width="20" height="20" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><path d="M22 2 11 13M22 2l-7 20-4-9-9-4 20-7z"/></svg>' }));
|
el('span', { html: '<svg viewBox="0 0 24 24" width="20" height="20" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><path d="M22 2 11 13M22 2l-7 20-4-9-9-4 20-7z"/></svg>' }));
|
||||||
const mic = el('button', { class: 'dross-mic', disabled: true, title: 'Voice arrives in Phase 2' },
|
const micLabel = el('span', {}, 'Tap to record');
|
||||||
el('span', { html: '<svg viewBox="0 0 24 24" width="22" height="22" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><rect x="9" y="3" width="6" height="11" rx="3"/><path d="M5 11a7 7 0 0 0 14 0"/><line x1="12" y1="18" x2="12" y2="22"/><line x1="8" y1="22" x2="16" y2="22"/></svg>' }), 'Hold to talk');
|
const mic = el('button', { class: 'dross-mic', title: 'Record a voice note' },
|
||||||
|
el('span', { html: '<svg viewBox="0 0 24 24" width="22" height="22" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round"><rect x="9" y="3" width="6" height="11" rx="3"/><path d="M5 11a7 7 0 0 0 14 0"/><line x1="12" y1="18" x2="12" y2="22"/><line x1="8" y1="22" x2="16" y2="22"/></svg>' }), micLabel);
|
||||||
const closeBtn = el('button', { class: 'dross-x', title: 'Close' }, '⤬');
|
const closeBtn = el('button', { class: 'dross-x', title: 'Close' }, '⤬');
|
||||||
const header = el('div', { class: 'dross-hd' }, drossAvatar(cfg.avatar, 30),
|
const header = el('div', { class: 'dross-hd' }, drossAvatar(cfg.avatar, 30),
|
||||||
el('div', { class: 'dross-who' }, 'Dross', el('small', {}, 'always here, regrettably')), closeBtn);
|
el('div', { class: 'dross-who' }, 'Dross', el('small', {}, 'always here, regrettably')), closeBtn);
|
||||||
@@ -62,6 +63,47 @@ export async function renderDrossBubble() {
|
|||||||
|
|
||||||
drag(fab, fab, true); drag(header, panel, false);
|
drag(fab, fab, true); drag(header, panel, false);
|
||||||
|
|
||||||
|
// ---- voice: tap mic to record, tap again to stop → transcribe → review-and-send ----
|
||||||
|
let media = null, chunks = [], recording = false;
|
||||||
|
function setMic(label, rec) { micLabel.textContent = label; mic.classList.toggle('rec', !!rec); }
|
||||||
|
// Ask for the microphone and start a recording. When the recorder stops, the
// captured audio is handed to sendClip(). On any failure the mic label briefly
// shows 'Mic blocked' and the microphone is released.
async function startRec() {
  let stream = null;
  try {
    stream = await navigator.mediaDevices.getUserMedia({ audio: true });
    const opt = (window.MediaRecorder && MediaRecorder.isTypeSupported('audio/webm;codecs=opus'))
      ? { mimeType: 'audio/webm;codecs=opus' } : {};
    // Keep per-recording references: the handlers below must not read the shared
    // `media` / `chunks`, which a later recording may already have replaced.
    const rec = new MediaRecorder(stream, opt);
    const parts = [];
    rec.ondataavailable = (e) => { if (e.data && e.data.size) parts.push(e.data); };
    rec.onstop = async () => {
      stream.getTracks().forEach(t => t.stop());
      await sendClip(new Blob(parts, { type: rec.mimeType || 'audio/webm' }));
    };
    rec.start();
    // Publish the shared state only once the recorder is actually running.
    media = rec; chunks = parts;
    recording = true; setMic('● Recording… tap to stop', true);
  } catch {
    // If the failure happened after the mic was granted (e.g. MediaRecorder is
    // missing or start() threw), stop the tracks so the microphone is not left on.
    if (stream) stream.getTracks().forEach(t => t.stop());
    setMic('Mic blocked', false); setTimeout(() => setMic('Tap to record', false), 1800);
  }
}
|
||||||
|
// Stop the active recording, if any. The recorder's onstop handler takes over from here.
function stopRec() {
  if (!media || !recording) return;
  recording = false;
  setMic('Transcribing…', false);
  media.stop();
}
|
||||||
|
// Upload a recorded clip for transcription and append the returned text to the
// chat input for the user to review. Any failure shows 'Transcribe failed' briefly.
async function sendClip(blob) {
  try {
    const form = new FormData();
    form.append('audio', blob, 'clip.webm');

    const token = localStorage.getItem('void_token') || '';
    const response = await fetch('/api/voice/transcribe', {
      method: 'POST',
      headers: { Authorization: 'Bearer ' + token },
      body: form
    });
    if (!response.ok) {
      throw new Error('stt');
    }

    const payload = await response.json();
    const { text } = payload;
    setMic('Tap to record', false);
    if (text) {
      // Append to whatever is already typed rather than overwriting it.
      if (input.value) {
        input.value = input.value + ' ' + text;
      } else {
        input.value = text;
      }
      input.focus();
    }
    // voiceMode 'handsfree'/'action' (Phase 2b+) would branch here.
  } catch {
    setMic('Transcribe failed', false);
    setTimeout(() => setMic('Tap to record', false), 2000);
  }
}
|
||||||
|
mic.addEventListener('click', () => recording ? stopRec() : startRec());
|
||||||
|
|
||||||
window.addEventListener('dross-settings-changed', async () => {
|
window.addEventListener('dross-settings-changed', async () => {
|
||||||
try { cfg = { ...cfg, ...(await api.get('/api/dross/settings')) }; } catch { return; }
|
try { cfg = { ...cfg, ...(await api.get('/api/dross/settings')) }; } catch { return; }
|
||||||
applyAccent(fab, cfg.accent); applyAccent(panel, cfg.accent);
|
applyAccent(fab, cfg.accent); applyAccent(panel, cfg.accent);
|
||||||
|
|||||||
@@ -765,6 +765,8 @@ body.drawer-open #scrim { opacity: 1; pointer-events: auto; }
|
|||||||
.dross-btnrow{display:flex;gap:10px}
|
.dross-btnrow{display:flex;gap:10px}
|
||||||
.dross-mic{flex:1;height:50px;border-radius:12px;border:1px solid var(--dross-dim);background:var(--dross-soft);color:var(--dross-glow);cursor:pointer;display:flex;align-items:center;justify-content:center;gap:9px;font-family:var(--font-ui);font-size:13px}
|
.dross-mic{flex:1;height:50px;border-radius:12px;border:1px solid var(--dross-dim);background:var(--dross-soft);color:var(--dross-glow);cursor:pointer;display:flex;align-items:center;justify-content:center;gap:9px;font-family:var(--font-ui);font-size:13px}
|
||||||
.dross-mic[disabled]{opacity:.5;cursor:not-allowed}
|
.dross-mic[disabled]{opacity:.5;cursor:not-allowed}
|
||||||
|
.dross-mic.rec{background:#3a1010;border-color:var(--accent);color:#fff;animation:dross-rec 1.2s infinite}
|
||||||
|
@keyframes dross-rec{0%,100%{box-shadow:0 0 0 0 rgba(255,79,46,.5)}50%{box-shadow:0 0 0 8px rgba(255,79,46,0)}}
|
||||||
.dross-send{width:64px;height:50px;border-radius:12px;border:1px solid var(--dross-dim);background:linear-gradient(180deg,var(--dross),var(--dross-dim));color:#fff;cursor:pointer;display:grid;place-items:center}
|
.dross-send{width:64px;height:50px;border-radius:12px;border:1px solid var(--dross-dim);background:linear-gradient(180deg,var(--dross),var(--dross-dim));color:#fff;cursor:pointer;display:grid;place-items:center}
|
||||||
.dross-collapse{display:flex;align-items:center;justify-content:center;gap:8px;height:34px;cursor:pointer;color:var(--muted);
|
.dross-collapse{display:flex;align-items:center;justify-content:center;gap:8px;height:34px;cursor:pointer;color:var(--muted);
|
||||||
font-family:var(--font-ui);font-size:11px;letter-spacing:.12em;text-transform:uppercase;background:#0b0810;border-top:1px solid var(--border)}
|
font-family:var(--font-ui);font-size:11px;letter-spacing:.12em;text-transform:uppercase;background:#0b0810;border-top:1px solid var(--border)}
|
||||||
|
|||||||
24
tests/routes/voice.test.js
Normal file
24
tests/routes/voice.test.js
Normal file
@@ -0,0 +1,24 @@
|
|||||||
|
import { describe, it, expect, beforeAll } from 'vitest';
|
||||||
|
import request from 'supertest';
|
||||||
|
import { createApp } from '../../server.js';
|
||||||
|
import { resetDb } from '../helpers/db.js';
|
||||||
|
import { migrateUp } from '../../lib/db/migrate.js';
|
||||||
|
|
||||||
|
let app;
|
||||||
|
const owner = { Authorization: 'Bearer test-token' };
|
||||||
|
beforeAll(async () => {
|
||||||
|
await resetDb(); await migrateUp();
|
||||||
|
process.env.OWNER_TOKEN = 'test-token';
|
||||||
|
app = createApp();
|
||||||
|
});
|
||||||
|
|
||||||
|
describe('voice transcribe route', () => {
  const endpoint = '/api/voice/transcribe';

  // No Authorization header at all: the route must reject the request.
  it('401 without a token', async () => {
    const response = await request(app).post(endpoint);
    expect(response.status).toBe(401);
  });

  // Authenticated, but no multipart `audio` part was sent.
  it('400 when no audio supplied', async () => {
    const response = await request(app).post(endpoint).set(owner);
    expect(response.status).toBe(400);
  });
});
|
||||||
Reference in New Issue
Block a user