const { useEffect, useRef, useState } = React;

// One binary send unit toward the server.
const AUDIO_SEND_SAMPLES = 640; // 40ms @ 16kHz

// Realtime voice-conversion client. Captures microphone audio, streams 16 kHz
// PCM16 chunks to the backend over a WebSocket, plays the converted audio
// back as it arrives, and tracks per-chunk latency metrics. Also manages the
// list of conversion targets (upload / record / rename / delete) and the
// recently saved sessions.
function App() {
  // --- UI state ---
  const [status, setStatus] = useState("initializing");
  const [streamStartedAt, setStreamStartedAt] = useState(null);
  const [processedChunks, setProcessedChunks] = useState(0);
  // Latency breakdown in milliseconds; keys mirror the server metrics payload.
  const [latencyMetrics, setLatencyMetrics] = useState({ source_audio_ms: 0, vc_ms: 0, e2e_ms: 0, model_e2e_ms: 0, context_ms: 0, queue_ms: 0, playback_wait_ms: 0, network_est_ms: 0, network_playout_ms: 0, true_e2e_ms: 0, current_ms: 0, smooth_ms: 0, future_ms: 0, streaming_core_ms: 0, streaming_total_ms: 0, });
  const [targetOptions, setTargetOptions] = useState([]);
  const [selectedTargetId, setSelectedTargetId] = useState("");
  const [targetLoading, setTargetLoading] = useState(false);
  const [targetError, setTargetError] = useState("");
  const [isUploadingTarget, setIsUploadingTarget] = useState(false);
  const [isRecordingTarget, setIsRecordingTarget] = useState(false);
  const [recentSessions, setRecentSessions] = useState([]);
  const [recentSessionsLoading, setRecentSessionsLoading] = useState(false);

  // --- Mutable session plumbing (refs so handlers see current values) ---
  const wsRef = useRef(null);
  const inputAudioContextRef = useRef(null);    // mic capture context (16 kHz)
  const playbackAudioContextRef = useRef(null); // output context (16 kHz)
  const mediaStreamRef = useRef(null);
  const processorRef = useRef(null);
  const playbackQueueRef = useRef([]);          // packets waiting to be played
  const playbackRunningRef = useRef(false);     // drain loop in flight?
  const currentSourceRef = useRef(null);        // currently playing buffer source
  const pendingChunkMetaRef = useRef([]);       // metadata queued ahead of its binary chunk
  const latestChunkMetaRef = useRef({ totalInputAudioMs: 0, modelE2EMs: 0, });
  const networkOneWayMsRef = useRef(0);         // EWMA of one-way network latency
  const pingIntervalRef = useRef(null);
  const streamStartedPerfRef = useRef(null);
  const sessionIdRef = useRef(`${Date.now()}-${Math.random().toString(16).slice(2)}`);
  const userStoppingRef = useRef(false);        // distinguishes user stop from errors
  // Target-recording plumbing (separate from the streaming session).
  const recordStreamRef = useRef(null);
  const recordAudioContextRef = useRef(null);
  const recordProcessorRef = useRef(null);
  const recordChunksRef = useRef([]);
  const recordSampleRateRef = useRef(16000);
  const sendPcmBufferRef = useRef([]);          // PCM samples pending until a full send unit

  // Unmount cleanup: tear down mic, sockets, and audio contexts.
  useEffect(() => () => stopSession(), []);
  // One-time page bootstrap.
  useEffect(() => { void initializePage(); }, []);

  // Loads targets, initializes the server-side VC runtime, and fetches recent
  // sessions in parallel; status becomes "idle" only if all succeed.
  async function initializePage() {
    setStatus("initializing");
    try {
      await Promise.all([loadTargets(), initVcRuntime(), loadRecentSessions()]);
      setStatus("idle");
    } catch (_error) {
      // NOTE(review): errors are swallowed here, so status can stay
      // "initializing" unless initVcRuntime already set an error status.
    }
  }

  // Fetches selectable conversion targets; if nothing is selected yet, falls
  // back to the server default, else the first target.
  async function loadTargets() {
    setTargetLoading(true);
    setTargetError("");
    try {
      const response = await fetch("/api/targets");
      if (!response.ok) { throw new Error(`http ${response.status}`); }
      const data = await response.json();
      const targets = Array.isArray(data.targets) ? data.targets : [];
      const defaultTargetId = data.default_target_id || (targets[0] ? targets[0].id : "");
      setTargetOptions(targets);
      // Keep a previously chosen target; only fall back when none was chosen.
      setSelectedTargetId((prev) => prev || defaultTargetId || "");
    } catch (error) {
      const message = error instanceof Error ? error.message : String(error);
      setTargetError(`target load failed: ${message}`);
    } finally {
      setTargetLoading(false);
    }
  }

  // Asks the backend to initialize the voice-conversion runtime. Rethrows so
  // initializePage does not flip status to "idle" on failure.
  async function initVcRuntime() {
    try {
      const response = await fetch("/api/vc/init", { method: "POST" });
      if (!response.ok) {
        const text = await response.text();
        throw new Error(text || `http ${response.status}`);
      }
    } catch (error) {
      const message = error instanceof Error ? error.message : String(error);
      setStatus(`error: vc init failed: ${message}`);
      throw error;
    }
  }

  // Loads up to three recently saved sessions; failures show an empty list.
  async function loadRecentSessions() {
    setRecentSessionsLoading(true);
    try {
      const response = await fetch("/api/sessions/recent?limit=3");
      if (!response.ok) { throw new Error(`http ${response.status}`); }
      const data = await response.json();
      const sessions = Array.isArray(data.sessions) ? data.sessions : [];
      setRecentSessions(sessions);
    } catch (_error) {
      setRecentSessions([]);
    } finally {
      setRecentSessionsLoading(false);
    }
  }

  // Uploads a target audio file; on success the new target replaces any
  // existing entry with the same id and becomes the current selection.
  async function uploadTargetFile(file, filenameHint = "") {
    if (!file) { return; }
    setIsUploadingTarget(true);
    setTargetError("");
    try {
      const formData = new FormData();
      formData.append("file", file, filenameHint || file.name || "target.wav");
      const response = await fetch("/api/targets/upload", { method: "POST", body: formData, });
      if (!response.ok) {
        const text = await response.text();
        throw new Error(text || `http ${response.status}`);
      }
      const data = await response.json();
      const target = data.target;
      if (!target || !target.id) { throw new Error("invalid upload response"); }
      setTargetOptions((prev) => {
        const filtered = prev.filter((item) => item.id !== target.id);
        return [...filtered, target];
      });
      setSelectedTargetId(target.id);
    } catch (error) {
      const message = error instanceof Error ? error.message : String(error);
      setTargetError(`target upload failed: ${message}`);
    } finally {
      setIsUploadingTarget(false);
    }
  }

  // File-input change handler. Clears the input value so choosing the same
  // file again still fires a change event.
  async function handleTargetFileChange(event) {
    const file = event.target.files && event.target.files[0];
    event.target.value = "";
    if (!file) { return; }
    await uploadTargetFile(file);
  }

  // Deletes the selected target (uploaded targets only) after confirmation,
  // then moves the selection to the first remaining target.
  async function deleteSelectedTarget() {
    const target = targetOptions.find((item) => item.id === selectedTargetId);
    if (!target || target.type !== "upload") { return; }
    if (!window.confirm(`Delete uploaded target "${target.name}"?`)) { return; }
    setTargetError("");
    try {
      const response = await fetch(`/api/targets?target_id=${encodeURIComponent(target.id)}`, { method: "DELETE", });
      if (!response.ok) {
        const text = await response.text();
        throw new Error(text || `http ${response.status}`);
      }
      const nextTargets = targetOptions.filter((item) => item.id !== target.id);
      setTargetOptions(nextTargets);
      setSelectedTargetId(nextTargets[0] ? nextTargets[0].id : "");
    } catch (error) {
      const message = error instanceof Error ? error.message : String(error);
      setTargetError(`target delete failed: ${message}`);
    }
  }

  // Renames the selected uploaded target via a prompt; no-ops on cancel,
  // empty input, or an unchanged name.
  async function renameSelectedTarget() {
    const target = targetOptions.find((item) => item.id === selectedTargetId);
    if (!target || target.type !== "upload") { return; }
    const nextName = window.prompt("Rename uploaded target", target.name);
    if (nextName === null) { return; }
    const trimmed = nextName.trim();
    if (!trimmed || trimmed === target.name) { return; }
    setTargetError("");
    try {
      const response = await fetch("/api/targets/rename", { method: "POST", headers: {"Content-Type": "application/json"}, body: JSON.stringify({target_id: target.id, name: trimmed}), });
      if (!response.ok) {
        const text = await response.text();
        throw new Error(text || `http ${response.status}`);
      }
      const data = await response.json();
      const renamed = data.target;
      if (!renamed || !renamed.id) { throw new Error("invalid rename response"); }
      setTargetOptions((prev) => prev.map((item) => (item.id === target.id ? renamed : item)));
      setSelectedTargetId(renamed.id);
    } catch (error) {
      const message = error instanceof Error ? error.message : String(error);
      setTargetError(`target rename failed: ${message}`);
    }
  }

  // Starts recording a new target voice from the microphone at the device's
  // native sample rate, accumulating Float32 chunks in memory.
  async function startTargetRecording() {
    if (isRecordingTarget) { return; }
    setTargetError("");
    try {
      const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
      recordStreamRef.current = stream;
      const context = new AudioContext();
      recordAudioContextRef.current = context;
      recordSampleRateRef.current = context.sampleRate;
      recordChunksRef.current = [];
      const source = context.createMediaStreamSource(stream);
      const processor = context.createScriptProcessor(2048, 1, 1);
      processor.onaudioprocess = (event) => {
        const input = event.inputBuffer.getChannelData(0);
        // Copy: the underlying buffer is reused between callbacks.
        recordChunksRef.current.push(new Float32Array(input));
      };
      source.connect(processor);
      processor.connect(context.destination);
      recordProcessorRef.current = processor;
      setIsRecordingTarget(true);
    } catch (error) {
      const message = error instanceof Error ? error.message : String(error);
      setTargetError(`record start failed: ${message}`);
    }
  }

  // Stops target recording, encodes the captured audio as WAV, and uploads
  // it as a new target.
  async function stopTargetRecording() {
    if (!isRecordingTarget) { return; }
    setIsRecordingTarget(false);
    if (recordProcessorRef.current) { recordProcessorRef.current.disconnect(); recordProcessorRef.current = null; }
    if (recordStreamRef.current) { recordStreamRef.current.getTracks().forEach((track) => track.stop()); recordStreamRef.current = null; }
    if (recordAudioContextRef.current) { await recordAudioContextRef.current.close(); recordAudioContextRef.current = null; }
    const chunks = recordChunksRef.current;
    recordChunksRef.current = [];
    if (!chunks.length) { setTargetError("recording is empty"); return; }
    const wavBlob = createWavBlob(chunks, recordSampleRateRef.current);
    const file = new File([wavBlob], `recorded-target-${Date.now()}.wav`, { type: "audio/wav" });
    await uploadTargetFile(file, file.name);
  }

  // Opens the streaming WebSocket, wires up all socket handlers, starts the
  // microphone on open, and begins a 2s ping loop for latency estimation.
  async function startSession() {
    if (wsRef.current || isRecordingTarget) return;
    userStoppingRef.current = false;
    setStatus("connecting");
    setProcessedChunks(0);
    setLatencyMetrics({ source_audio_ms: 0, vc_ms: 0, e2e_ms: 0, queue_ms: 0, playback_wait_ms: 0, network_est_ms: 0, model_e2e_ms: 0, context_ms: 0, network_playout_ms: 0, true_e2e_ms: 0, current_ms: 0, smooth_ms: 0, future_ms: 0, streaming_core_ms: 0, streaming_total_ms: 0, });
    setStreamStartedAt(Date.now());
    streamStartedPerfRef.current = performance.now();
    pendingChunkMetaRef.current = [];
    networkOneWayMsRef.current = 0;
    const protocol = location.protocol === "https:" ? "wss" : "ws";
    // "0.0.0.0" is not routable in browsers; rewrite to loopback.
    const wsHost = location.hostname === "0.0.0.0" ? `127.0.0.1${location.port ? `:${location.port}` : ""}` : location.host;
    const ws = new WebSocket(`${protocol}://${wsHost}/ws`);
    ws.binaryType = "arraybuffer";
    ws.onopen = async () => {
      try {
        ws.send( JSON.stringify({ action: "session_config", sample_rate: 16000, target_id: selectedTargetId || "", }) );
        ws.send(JSON.stringify({ event: "start", session_id: sessionIdRef.current }));
        await startMicrophone(ws);
        setStatus("streaming");
      } catch (error) {
        const message = error instanceof Error ? error.message : String(error);
        setStatus(`error: ${message}`);
        try { ws.close(); } catch (_closeError) { }
      }
    };
    ws.onerror = () => {
      if (userStoppingRef.current) { return; }
      setStatus("error: websocket connect failed");
    };
    // Text frames carry JSON control/metrics messages; binary frames carry
    // converted PCM16 audio. NOTE(review): JSON.parse is not guarded here —
    // a malformed text frame would reject this async handler.
    ws.onmessage = async (event) => {
      if (typeof event.data === "string") {
        const payload = JSON.parse(event.data);
        if (payload.action === "vc_chunk") {
          const metrics = payload.data.metrics || {};
          const currentMs = Number(metrics.current_ms || 0);
          const smoothMs = Number(metrics.smooth_ms || 0);
          const futureMs = Number(metrics.future_ms || 0);
          const vcMs = Number(metrics.vc_ms || 0);
          // Model-side totals derived client-side from the server breakdown.
          const contextMs = currentMs + smoothMs + futureMs;
          const modelE2EMs = contextMs + vcMs;
          const totalInputAudioMs = Number(payload.data.total_input_audio_ms || 0);
          latestChunkMetaRef.current = { totalInputAudioMs, modelE2EMs, };
          const hasAudio = payload.data.has_audio !== false;
          if (hasAudio) {
            // Queue metadata so the matching binary frame can pick it up.
            pendingChunkMetaRef.current.push({ chunkIndex: Number(payload.data.chunk_index || 0), totalInputAudioMs, modelE2EMs, });
          }
          setProcessedChunks((prev) => prev + 1);
          setLatencyMetrics((prev) => ({ ...prev, source_audio_ms: Number(metrics.source_audio_ms || 0), vc_ms: vcMs, e2e_ms: Number(metrics.e2e_ms || 0), model_e2e_ms: modelE2EMs, context_ms: contextMs, queue_ms: Number(metrics.queue_ms || 0), current_ms: currentMs, smooth_ms: smoothMs, future_ms: futureMs, streaming_core_ms: Number(metrics.streaming_core_ms || 0), streaming_total_ms: Number(metrics.streaming_total_ms || 0), }));
        }
        if (payload.action === "pong") {
          const sentMs = Number(payload.data?.client_ts_ms || 0);
          if (sentMs > 0) {
            const rttMs = Math.max(0, Date.now() - sentMs);
            const oneWay = rttMs / 2;
            // Exponentially weighted moving average (0.7 old / 0.3 new).
            const prev = networkOneWayMsRef.current;
            networkOneWayMsRef.current = prev > 0 ? (prev * 0.7 + oneWay * 0.3) : oneWay;
          }
        }
        if (payload.action === "error") { setStatus(`error: ${payload.data.message}`); }
      } else {
        // Binary audio frame: pair it with its queued metadata (falling back
        // to the latest known metadata) and enqueue for playback.
        const chunkMeta = pendingChunkMetaRef.current.shift() || latestChunkMetaRef.current || null;
        enqueueAudio({ arrayBuffer: event.data, receivedAtPerf: performance.now(), chunkMeta, });
      }
    };
    ws.onclose = (event) => {
      if (pingIntervalRef.current) { clearInterval(pingIntervalRef.current); pingIntervalRef.current = null; }
      wsRef.current = null;
      if (userStoppingRef.current) { userStoppingRef.current = false; setStatus("idle"); return; }
      setStatus((prev) => {
        if (prev.startsWith("error:")) { return prev; }
        if (event.code !== 1000) { return `error: socket closed (${event.code})`; }
        return "idle";
      });
    };
    pingIntervalRef.current = setInterval(() => {
      if (ws.readyState === WebSocket.OPEN) {
        ws.send(JSON.stringify({ action: "ping", client_ts_ms: Date.now() }));
      }
    }, 2000);
    wsRef.current = ws;
  }

  // Starts microphone capture at 16 kHz and streams PCM16 over the socket in
  // fixed AUDIO_SEND_SAMPLES units, buffering the remainder between callbacks.
  async function startMicrophone(ws) {
    const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
    mediaStreamRef.current = stream;
    const audioContext = new AudioContext({ sampleRate: 16000 });
    inputAudioContextRef.current = audioContext;
    const source = audioContext.createMediaStreamSource(stream);
    const processor = audioContext.createScriptProcessor(512, 1, 1);
    sendPcmBufferRef.current = [];
    // Report the actual context rate; browsers may not honor 16000 exactly.
    ws.send( JSON.stringify({ event: "config_audio", sample_rate: audioContext.sampleRate, }) );
    processor.onaudioprocess = (event) => {
      if (ws.readyState !== WebSocket.OPEN) return;
      const input = event.inputBuffer.getChannelData(0);
      // Float [-1, 1] -> signed 16-bit (asymmetric scale avoids overflow at -1).
      const pcm = new Int16Array(input.length);
      for (let i = 0; i < input.length; i += 1) {
        const sample = Math.max(-1, Math.min(1, input[i]));
        pcm[i] = sample < 0 ? sample * 0x8000 : sample * 0x7fff;
      }
      const pending = sendPcmBufferRef.current;
      for (let i = 0; i < pcm.length; i += 1) { pending.push(pcm[i]); }
      while (pending.length >= AUDIO_SEND_SAMPLES) {
        const chunk = pending.splice(0, AUDIO_SEND_SAMPLES);
        ws.send(new Int16Array(chunk).buffer);
      }
    };
    source.connect(processor);
    processor.connect(audioContext.destination);
    processorRef.current = processor;
  }

  // Adds a packet to the playback queue and kicks the drain loop if idle.
  function enqueueAudio(packet) {
    playbackQueueRef.current.push(packet);
    if (!playbackRunningRef.current) { void drainPlaybackQueue(); }
  }

  // Plays queued packets back-to-back; only one drain loop runs at a time.
  async function drainPlaybackQueue() {
    playbackRunningRef.current = true;
    while (playbackQueueRef.current.length > 0) {
      const nextPacket = playbackQueueRef.current.shift();
      if (!nextPacket) { continue; }
      await playAudioChunk(nextPacket);
    }
    playbackRunningRef.current = false;
  }

  // Decodes one PCM16 packet into an AudioBuffer, updates post-inference
  // latency metrics, and resolves after playback of the chunk finishes.
  async function playAudioChunk(packet) {
    let audioContext = playbackAudioContextRef.current;
    if (!audioContext) {
      audioContext = new AudioContext({ sampleRate: 16000 });
      playbackAudioContextRef.current = audioContext;
    }
    const arrayBuffer = packet.arrayBuffer;
    const pcm = new Int16Array(arrayBuffer);
    const audioBuffer = audioContext.createBuffer(1, pcm.length, 16000);
    const channel = audioBuffer.getChannelData(0);
    for (let i = 0; i < pcm.length; i += 1) { channel[i] = pcm[i] / 32768; }
    const source = audioContext.createBufferSource();
    source.buffer = audioBuffer;
    source.connect(audioContext.destination);
    currentSourceRef.current = source;
    const playbackStartPerf = performance.now();
    const receivedAtPerf = Number(packet.receivedAtPerf || playbackStartPerf);
    const chunkMeta = packet.chunkMeta;
    if (chunkMeta) {
      // true E2E = model time + estimated one-way network + client queue wait.
      const modelE2EMs = Number(chunkMeta.modelE2EMs || 0);
      const playbackWaitMs = Math.max(0, playbackStartPerf - receivedAtPerf);
      const networkMs = Math.max(0, Number(networkOneWayMsRef.current || 0));
      const postInferMs = networkMs + playbackWaitMs;
      const trueE2EMs = modelE2EMs + postInferMs;
      setLatencyMetrics((prev) => {
        return { ...prev, playback_wait_ms: Math.round(playbackWaitMs), network_est_ms: Math.round(networkMs), true_e2e_ms: Math.round(trueE2EMs), network_playout_ms: Math.round(postInferMs), };
      });
    }
    await new Promise((resolve) => {
      source.onended = () => {
        if (currentSourceRef.current === source) { currentSourceRef.current = null; }
        resolve();
      };
      source.start();
    });
  }

  // Tears down the whole streaming session: mic, audio contexts, playback
  // queue, latency state, ping loop, and the socket (sending "stop" first).
  // Also rotates the session id and refreshes the saved-session list shortly
  // after, giving the server time to persist.
  function stopSession() {
    userStoppingRef.current = true;
    if (processorRef.current) { processorRef.current.disconnect(); processorRef.current = null; }
    if (mediaStreamRef.current) { mediaStreamRef.current.getTracks().forEach((track) => track.stop()); mediaStreamRef.current = null; }
    if (inputAudioContextRef.current) { inputAudioContextRef.current.close(); inputAudioContextRef.current = null; }
    if (playbackAudioContextRef.current) { playbackAudioContextRef.current.close(); playbackAudioContextRef.current = null; }
    if (currentSourceRef.current) { try { currentSourceRef.current.stop(); } catch (_error) { } currentSourceRef.current = null; }
    playbackQueueRef.current = [];
    playbackRunningRef.current = false;
    sendPcmBufferRef.current = [];
    pendingChunkMetaRef.current = [];
    latestChunkMetaRef.current = { totalInputAudioMs: 0, modelE2EMs: 0, };
    networkOneWayMsRef.current = 0;
    if (pingIntervalRef.current) { clearInterval(pingIntervalRef.current); pingIntervalRef.current = null; }
    streamStartedPerfRef.current = null;
    if (wsRef.current) {
      if (wsRef.current.readyState === WebSocket.OPEN) {
        wsRef.current.send(JSON.stringify({ event: "stop", session_id: sessionIdRef.current }));
      }
      wsRef.current.close();
      wsRef.current = null;
    }
    sessionIdRef.current = `${Date.now()}-${Math.random().toString(16).slice(2)}`;
    setStatus("idle");
    setTimeout(() => { void loadRecentSessions(); }, 300);
  }

  // Deletes one saved session's audio after confirmation.
  async function deleteSavedSession(sessionId) {
    if (!sessionId || !window.confirm(`Delete saved audio "${sessionId}"?`)) { return; }
    try {
      const response = await fetch(`/api/sessions/audio?session_id=${encodeURIComponent(sessionId)}`, { method: "DELETE", });
      if (!response.ok) { const text = await response.text(); throw new Error(text || `http ${response.status}`); }
      setRecentSessions((prev) => prev.filter((item) => item.session_id !== sessionId));
    } catch (error) {
      const message = error instanceof Error ? error.message : String(error);
      setStatus(`error: saved audio delete failed: ${message}`);
    }
  }

  // --- Derived render values ---
  const durationText = streamStartedAt ? `${Math.max(0, Math.round((Date.now() - streamStartedAt) / 1000))}s` : "--";
  const selectedTarget = targetOptions.find((item) => item.id === selectedTargetId) || null;
  const targetPreviewSrc = selectedTargetId ? `/api/targets/audio?target_id=${encodeURIComponent(selectedTargetId)}` : "";
  // NOTE(review): the JSX below lost its markup tags in an extraction step
  // (only text content and expression braces remain). Preserved verbatim —
  // the original markup must be restored from version control.
  return (

Realtime Voice Conversion

X-VC

Target Preview

{selectedTargetId ? (
{targetError ?

{targetError}

: null}

Status

{status}

Session Duration

{durationText}

Processed Chunks

{processedChunks}

Latency

E2E Total {formatLatency(latencyMetrics.true_e2e_ms)}
Model {formatLatency(latencyMetrics.model_e2e_ms)}
Network {formatLatency(latencyMetrics.network_playout_ms)}

Network = E2E Total - Model

Context {formatLatency(latencyMetrics.context_ms)}
VC {formatLatency(latencyMetrics.vc_ms)}
Network Est {formatLatency(latencyMetrics.network_est_ms)}
Playback Wait {formatLatency(latencyMetrics.playback_wait_ms)}
Current {formatLatency(latencyMetrics.current_ms)}
Smooth {formatLatency(latencyMetrics.smooth_ms)}
Future {formatLatency(latencyMetrics.future_ms)}

Saved Audio (After Stop)

{recentSessions.length === 0 ? (

{recentSessionsLoading ? "Loading..." : "No saved sessions yet"}

) : (
{recentSessions.map((item) => (

{item.session_id}

Input

{item.input_available ? (

Output

{item.output_available ? (
))}
)}
); } ReactDOM.createRoot(document.getElementById("root")).render(); function formatLatency(value) { const number = Number(value || 0); if (!Number.isFinite(number) || number < 0) { return "--"; } return `${Math.round(number)} ms`; } function createWavBlob(chunks, sampleRate) { const totalLength = chunks.reduce((sum, chunk) => sum + chunk.length, 0); const merged = new Float32Array(totalLength); let offset = 0; for (const chunk of chunks) { merged.set(chunk, offset); offset += chunk.length; } const pcm16 = new Int16Array(totalLength); for (let i = 0; i < totalLength; i += 1) { const sample = Math.max(-1, Math.min(1, merged[i])); pcm16[i] = sample < 0 ? sample * 0x8000 : sample * 0x7fff; } const dataSize = pcm16.length * 2; const buffer = new ArrayBuffer(44 + dataSize); const view = new DataView(buffer); writeString(view, 0, "RIFF"); view.setUint32(4, 36 + dataSize, true); writeString(view, 8, "WAVE"); writeString(view, 12, "fmt "); view.setUint32(16, 16, true); view.setUint16(20, 1, true); view.setUint16(22, 1, true); view.setUint32(24, sampleRate, true); view.setUint32(28, sampleRate * 2, true); view.setUint16(32, 2, true); view.setUint16(34, 16, true); writeString(view, 36, "data"); view.setUint32(40, dataSize, true); let wavOffset = 44; for (let i = 0; i < pcm16.length; i += 1) { view.setInt16(wavOffset, pcm16[i], true); wavOffset += 2; } return new Blob([view], { type: "audio/wav" }); } function writeString(view, offset, value) { for (let i = 0; i < value.length; i += 1) { view.setUint8(offset + i, value.charCodeAt(i)); } }