diff --git a/deploy/whisper/README.md b/deploy/whisper/README.md new file mode 100644 index 0000000..7bc786b --- /dev/null +++ b/deploy/whisper/README.md @@ -0,0 +1,16 @@ +# faster-whisper service (Dross voice STT) + +Runs on **CT 102** (the Ollama box, `192.168.1.185`), bare-metal (no Docker), on the +RTX A2000 with CPU fallback. OpenAI-style `/transcribe` consumed by void-app +`lib/voice/whisper.js` (`WHISPER_URL=http://192.168.1.185:8001`). + +## Install (to CT 102, run from the repo root) +``` +scp deploy/whisper/server.py root@192.168.1.185:/opt/whisper_server.py && scp deploy/whisper/setup.sh root@192.168.1.185:/root/setup.sh +ssh root@192.168.1.185 'bash /root/setup.sh && install -m644 /opt/whisper_server.py /opt/whisper/server.py && systemctl enable --now whisper' +curl http://192.168.1.185:8001/health # {"ok":true,"model":"small.en","device":"cuda"} +``` +- venv at `/opt/whisper/venv`; model `small.en` (env `WHISPER_MODEL`); CUDA libs via + `nvidia-cublas-cu12`/`nvidia-cudnn-cu12` pip wheels (LD_LIBRARY_PATH in the unit). +- GPU → CPU fallback is in `server.py` `load()`. +- **CT 102 disk was expanded +20G** (was 89% full) before install. 
diff --git a/deploy/whisper/server.py b/deploy/whisper/server.py
new file mode 100644
index 0000000..0258fd9
--- /dev/null
+++ b/deploy/whisper/server.py
@@ -0,0 +1,60 @@
+import logging
+import os
+import tempfile
+import threading
+
+from fastapi import FastAPI, UploadFile, File, HTTPException
+from faster_whisper import WhisperModel
+
+MODEL = os.environ.get("WHISPER_MODEL", "small.en")
+log = logging.getLogger("whisper")
+app = FastAPI()
+model = None
+device_used = None
+# One inference at a time: handlers run in worker threads, the model is shared.
+_infer_lock = threading.Lock()
+
+
+def load():
+    """Load MODEL on the GPU, falling back to CPU if the CUDA load raises."""
+    global model, device_used
+    try:
+        model = WhisperModel(MODEL, device="cuda", compute_type="int8_float16")
+        device_used = "cuda"
+    except Exception:
+        # Deliberate best-effort fallback, but record why so a broken CUDA
+        # library path does not silently degrade the service to CPU.
+        log.exception("CUDA load failed; falling back to CPU")
+        model = WhisperModel(MODEL, device="cpu", compute_type="int8")
+        device_used = "cpu"
+
+
+load()
+
+
+@app.get("/health")
+def health():
+    """Report the configured model name and the device it was loaded on."""
+    return {"ok": True, "model": MODEL, "device": device_used}
+
+
+@app.post("/transcribe")
+def transcribe(file: UploadFile = File(...)):
+    """Transcribe one uploaded audio clip; 400 if the upload is empty.
+
+    Declared as a plain ``def`` so FastAPI runs it in its threadpool:
+    inference is blocking and inside ``async def`` it would stall the
+    event loop (including /health) for the whole clip.
+    """
+    data = file.file.read()
+    if not data:
+        raise HTTPException(400, "empty audio")
+    with tempfile.NamedTemporaryFile(suffix=".bin") as f:
+        f.write(data)
+        f.flush()
+        with _infer_lock:
+            segments, info = model.transcribe(f.name, beam_size=1, vad_filter=True)
+            # segments is a lazy generator; joining it is what runs the decoder.
+            text = "".join(s.text for s in segments).strip()
+    return {"text": text, "language": info.language,
+            "duration": round(info.duration, 2), "device": device_used}
diff --git a/deploy/whisper/setup.sh b/deploy/whisper/setup.sh
new file mode 100644
index 0000000..b428e51
--- /dev/null
+++ b/deploy/whisper/setup.sh
@@ -0,0 +1,26 @@
+#!/usr/bin/env bash
+set -e
+export DEBIAN_FRONTEND=noninteractive
+apt-get update -qq
+apt-get install -y -qq python3-pip python3-venv ffmpeg >/dev/null
+mkdir -p /opt/whisper
+python3 -m venv /opt/whisper/venv
+/opt/whisper/venv/bin/pip install -q --upgrade pip
+/opt/whisper/venv/bin/pip install -q faster-whisper fastapi "uvicorn[standard]" python-multipart nvidia-cublas-cu12 nvidia-cudnn-cu12
+SITE=$(/opt/whisper/venv/bin/python -c 'import sysconfig; print(sysconfig.get_paths()["purelib"])')
+cat > /etc/systemd/system/whisper.service < {
+  if (!req.file || !req.file.buffer?.length) {
+    return res.status(400).json({ error: { code: 'no_audio', message: 'no audio supplied' } });
+  }
+  try {
+    const r = await whisper.transcribe(
+      req.file.buffer, req.file.originalname || 'clip.webm', req.file.mimetype || 'audio/webm');
+    res.json({ text: r.text, duration: r.duration ?? null });
+  } catch {
+    res.status(503).json({ error: { code: 'stt_unavailable', message: 'transcription service unavailable' } });
+  }
+}));
diff --git a/lib/voice/whisper.js b/lib/voice/whisper.js
new file mode 100644
index 0000000..75a36d3
--- /dev/null
+++ b/lib/voice/whisper.js
@@ -0,0 +1,22 @@
+// Thin client for the local faster-whisper service on CT 102 (the Ollama box).
+// GPU with CPU fallback lives in the service itself; here we just POST the audio
+// buffer and return the transcript. LAN-only endpoint.
+const WHISPER_URL = process.env.WHISPER_URL || 'http://192.168.1.185:8001';
+
+export async function transcribe(buffer, filename = 'clip.webm', mime = 'audio/webm') {
+  const fd = new FormData();
+  fd.append('file', new Blob([buffer], { type: mime }), filename);
+  const res = await fetch(`${WHISPER_URL}/transcribe`, {
+    method: 'POST', body: fd, signal: AbortSignal.timeout(120000)
+  });
+  if (!res.ok) throw new Error(`whisper ${res.status}`);
+  const j = await res.json();
+  return { text: (j.text || '').trim(), duration: j.duration, device: j.device };
+}
+
+export async function health() {
+  try {
+    const res = await fetch(`${WHISPER_URL}/health`, { signal: AbortSignal.timeout(5000) });
+    return res.ok ? await res.json() : null;
+  } catch { return null; }
+}
diff --git a/package.json b/package.json
index 308d813..2e36d9a 100644
--- a/package.json
+++ b/package.json
@@ -1,6 +1,6 @@
 {
   "name": "void-server",
-  "version": "2.11.0",
+  "version": "2.12.0",
   "type": "module",
   "private": true,
   "scripts": {
diff --git a/public/components/dross_bubble.js b/public/components/dross_bubble.js
index 5f9767f..17876f4 100644
--- a/public/components/dross_bubble.js
+++ b/public/components/dross_bubble.js
@@ -22,8 +22,9 @@ export async function renderDrossBubble() {
   const input = el('textarea', { rows: 1, placeholder: 'Ask Dross…' });
   const sendBtn = el('button', { class: 'dross-send', title: 'Send' },
     el('span', { html: '' }));
-  const mic = el('button', { class: 'dross-mic', disabled: true, title: 'Voice arrives in Phase 2' },
-    el('span', { html: '' }), 'Hold to talk');
+  const micLabel = el('span', {}, 'Tap to record');
+  const mic = el('button', { class: 'dross-mic', title: 'Record a voice note' },
+    el('span', { html: '' }), micLabel);
   const closeBtn = el('button', { class: 'dross-x', title: 'Close' }, '⤬');
   const header = el('div', { class: 'dross-hd' }, drossAvatar(cfg.avatar, 30),
     el('div', { class: 'dross-who' }, 'Dross', el('small', {}, 'always here, regrettably')), closeBtn);
@@ -62,6 +63,55 @@ export async function renderDrossBubble() {
   drag(fab, fab, true);
   drag(header, panel, false);
 
+  // ---- voice: tap mic to record, tap again to stop → transcribe → review-and-send ----
+  let media = null, chunks = [], recording = false, starting = false;
+  function setMic(label, rec) { micLabel.textContent = label; mic.classList.toggle('rec', !!rec); }
+  async function startRec() {
+    // Ignore taps while a getUserMedia permission prompt is still pending.
+    if (starting) return;
+    starting = true;
+    let stream = null;
+    try {
+      stream = await navigator.mediaDevices.getUserMedia({ audio: true });
+      chunks = [];
+      const opt = (window.MediaRecorder && MediaRecorder.isTypeSupported('audio/webm;codecs=opus'))
+        ? { mimeType: 'audio/webm;codecs=opus' } : {};
+      media = new MediaRecorder(stream, opt);
+      media.ondataavailable = (e) => { if (e.data && e.data.size) chunks.push(e.data); };
+      media.onstop = async () => {
+        stream.getTracks().forEach(t => t.stop());
+        await sendClip(new Blob(chunks, { type: media.mimeType || 'audio/webm' }));
+      };
+      media.start();
+      recording = true; setMic('● Recording… tap to stop', true);
+    } catch {
+      // Recorder setup can fail after permission was granted; release the mic.
+      if (stream) stream.getTracks().forEach(t => t.stop());
+      setMic('Mic blocked', false); setTimeout(() => setMic('Tap to record', false), 1800);
+    } finally {
+      starting = false;
+    }
+  }
+  function stopRec() {
+    if (media && recording) { recording = false; setMic('Transcribing…', false); media.stop(); }
+  }
+  async function sendClip(blob) {
+    try {
+      const fd = new FormData(); fd.append('audio', blob, 'clip.webm');
+      const res = await fetch('/api/voice/transcribe', {
+        method: 'POST', headers: { Authorization: 'Bearer ' + (localStorage.getItem('void_token') || '') }, body: fd
+      });
+      if (!res.ok) throw new Error('stt');
+      const { text } = await res.json();
+      setMic('Tap to record', false);
+      if (text) { input.value = input.value ? (input.value + ' ' + text) : text; input.focus(); }
+      // voiceMode 'handsfree'/'action' (Phase 2b+) would branch here.
+    } catch {
+      setMic('Transcribe failed', false); setTimeout(() => setMic('Tap to record', false), 2000);
+    }
+  }
+  mic.addEventListener('click', () => recording ? stopRec() : startRec());
+
   window.addEventListener('dross-settings-changed', async () => {
     try { cfg = { ...cfg, ...(await api.get('/api/dross/settings')) }; } catch { return; }
     applyAccent(fab, cfg.accent); applyAccent(panel, cfg.accent);
diff --git a/public/style.css b/public/style.css
index 1e69878..8707b3c 100644
--- a/public/style.css
+++ b/public/style.css
@@ -765,6 +765,8 @@ body.drawer-open #scrim { opacity: 1; pointer-events: auto; }
 .dross-btnrow{display:flex;gap:10px}
 .dross-mic{flex:1;height:50px;border-radius:12px;border:1px solid var(--dross-dim);background:var(--dross-soft);color:var(--dross-glow);cursor:pointer;display:flex;align-items:center;justify-content:center;gap:9px;font-family:var(--font-ui);font-size:13px}
 .dross-mic[disabled]{opacity:.5;cursor:not-allowed}
+.dross-mic.rec{background:#3a1010;border-color:var(--accent);color:#fff;animation:dross-rec 1.2s infinite}
+@keyframes dross-rec{0%,100%{box-shadow:0 0 0 0 rgba(255,79,46,.5)}50%{box-shadow:0 0 0 8px rgba(255,79,46,0)}}
 .dross-send{width:64px;height:50px;border-radius:12px;border:1px solid var(--dross-dim);background:linear-gradient(180deg,var(--dross),var(--dross-dim));color:#fff;cursor:pointer;display:grid;place-items:center}
 .dross-collapse{display:flex;align-items:center;justify-content:center;gap:8px;height:34px;cursor:pointer;color:var(--muted);
   font-family:var(--font-ui);font-size:11px;letter-spacing:.12em;text-transform:uppercase;background:#0b0810;border-top:1px solid var(--border)}
diff --git a/tests/routes/voice.test.js b/tests/routes/voice.test.js
new file mode 100644
index 0000000..2c5a4d0
--- /dev/null
+++ b/tests/routes/voice.test.js
@@ -0,0 +1,24 @@
+import { describe, it, expect, beforeAll } from 'vitest';
+import request from 'supertest';
+import { createApp } from '../../server.js';
+import { resetDb } from '../helpers/db.js';
+import { migrateUp } from '../../lib/db/migrate.js';
+
+let app;
+const owner = { Authorization: 'Bearer test-token' };
+beforeAll(async () => {
+  await resetDb(); await migrateUp();
+  process.env.OWNER_TOKEN = 'test-token';
+  app = createApp();
+});
+
+describe('voice transcribe route', () => {
+  it('401 without a token', async () => {
+    const res = await request(app).post('/api/voice/transcribe');
+    expect(res.status).toBe(401);
+  });
+  it('400 when no audio supplied', async () => {
+    const res = await request(app).post('/api/voice/transcribe').set(owner);
+    expect(res.status).toBe(400);
+  });
+});