// VocRT/frontend/src/components/VoiceChat.tsx
import React, { useRef, useState } from "react";
import { Mic, MicOff, Loader2 } from "lucide-react";
import { Button } from "@/components/ui/button";
import { useToast } from "@/hooks/use-toast";
import { useAppDispatch, useAppSelector } from "@/redux/hooks";
import { addMessage, setIsListening } from "@/redux/slices/chatSlice";
import {
setConnected,
setThinking,
setTranscribing,
} from "@/redux/slices/sessionSlice";
interface MessageMetadata {
session_id: string;
sequence_id: string;
transcript: string;
}
const VoiceChat = () => {
// WebSocket and audio state
const [socketConnected, setSocketConnected] = useState(false);
const [audioStream, setAudioStream] = useState<MediaStream | null>(null);
const [audioContext, setAudioContext] = useState<AudioContext | null>(null);
const [audioContextState, setAudioContextState] = useState(false);
const [elapsedTime, setElapsedTime] = useState(0);
const [latency, setLatency] = useState(0);
const [generating, setGenerating] = useState(false);
const [isPlaying, setIsPlaying] = useState(false);
const { sessionId, transcribing, thinking, connected } = useAppSelector(
(state) => state.session
);
// App state management with Redux
const { toast } = useToast();
const dispatch = useAppDispatch();
const { activeChat } = useAppSelector((state) => state.chats);
const isListening = useAppSelector((state) => state.chat.isListening);
const { temperature, activeVoice, maxTokens, threshold, silenceDuration } =
useAppSelector((state) => state.settings);
// Refs for audio processing
const socketRef = useRef<WebSocket | null>(null);
const sourceRef = useRef<AudioBufferSourceNode | null>(null);
const audioContextRef = useRef<AudioContext | null>(null);
const playingRef = useRef<boolean>(false);
const lastShiftedRef = useRef<string | null>(null);
const bufferQueueRef = useRef<string[]>([]);
const isGrpcRef = useRef<boolean | null>(null);
const formatTime = (milliseconds: number): string => {
const totalSeconds = Math.floor(milliseconds / 1000);
const hours = Math.floor(totalSeconds / 3600);
const minutes = Math.floor((totalSeconds % 3600) / 60);
const seconds = totalSeconds % 60;
return `${String(hours).padStart(2, "0")}:${String(minutes).padStart(
2,
"0"
)}:${String(seconds).padStart(2, "0")}`;
};
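// Convert Float32 samples in [-1, 1] from the Web Audio API into signed 16-bit PCM,
// which is the raw format streamed to the backend over the WebSocket.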
const floatTo16BitPCM = (input: Float32Array): ArrayBuffer => {
const output = new Int16Array(input.length);
for (let i = 0; i < input.length; i++) {
const sample = Math.max(-1, Math.min(1, input[i]));
output[i] = sample < 0 ? sample * 0x8000 : sample * 0x7fff;
}
return output.buffer;
};
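// Pulls the next base64-encoded frame off the buffer queue, decodes it, and plays it
// through the shared AudioContext. When a frame finishes, onended reports the played
// segment back to the server and chains playback of the next queued frame.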
const handlePlay = async (timestamp: number): Promise<void> => {
try {
if (!isGrpcRef.current) {
return;
}
playingRef.current = true;
setIsPlaying(true);
const base64Data = bufferQueueRef.current.shift();
if (!base64Data) {
playingRef.current = false;
setIsPlaying(false);
return;
}
lastShiftedRef.current = base64Data;
const bytes = Uint8Array.from(atob(base64Data), (char) => char.charCodeAt(0));
let arrayBuffer = bytes.buffer;
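// Each decoded frame starts with a UTF-8 JSON metadata header, followed by a
// NUL (0x00) separator and then the raw audio bytes.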
const metadataEndIndex = bytes.indexOf(0);
const metadataStr = new TextDecoder().decode(
bytes.slice(0, metadataEndIndex)
);
const metadata = JSON.parse(metadataStr) as MessageMetadata;
const { session_id, sequence_id, transcript } = metadata;
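// A sequence_id of "-2" appears to be a sentinel frame: instead of carrying audio,
// it starts the latency timer that runs until the first real audio frame is played.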
if (sequence_id !== "-2") {
if (socketRef.current && (socketRef.current as any).interval) {
clearInterval((socketRef.current as any).interval);
(socketRef.current as any).interval = null;
setLatency((prev) => prev + 150);
dispatch(setThinking(false));
}
arrayBuffer = arrayBuffer.slice(metadataEndIndex + 1);
try {
if (audioContextRef.current?.state === "suspended") {
await audioContextRef.current.resume();
}
if (!audioContextRef.current) {
return;
}
const audioBuffer = await audioContextRef.current.decodeAudioData(
arrayBuffer
);
if (sourceRef.current) {
sourceRef.current.disconnect();
}
const source = audioContextRef.current.createBufferSource();
source.buffer = audioBuffer;
source.connect(audioContextRef.current.destination);
source.start(0);
sourceRef.current = source;
(sourceRef.current as any).session_id = session_id;
(sourceRef.current as any).sequence_id = sequence_id;
(sourceRef.current as any).transcript = transcript;
sourceRef.current.onended = () => {
lastShiftedRef.current = null;
if (
socketRef.current?.readyState === WebSocket.OPEN &&
(sourceRef.current as any)?.sequence_id
) {
socketRef.current.send(
JSON.stringify({
type: "status",
msg: {
session_id: (sourceRef.current as any)?.session_id,
sequence_id: (sourceRef.current as any)?.sequence_id,
transcript: (sourceRef.current as any)?.transcript,
},
})
);
}
if (bufferQueueRef.current.length > 0) {
playingRef.current = true;
setIsPlaying(true);
const currentTimestamp = Date.now();
handlePlay(currentTimestamp);
} else {
playingRef.current = false;
setIsPlaying(false);
}
};
} catch (error) {
console.error("Error decoding audio data:", error);
}
} else {
const startTime = Date.now();
const interval = setInterval(() => {
setLatency(Date.now() - startTime);
}, 10);
(socketRef.current as any).interval = interval;
if (bufferQueueRef.current.length > 0) {
playingRef.current = true;
setIsPlaying(true);
const currentTimestamp = Date.now();
handlePlay(currentTimestamp);
} else {
playingRef.current = false;
setIsPlaying(false);
}
}
} catch (error) {
console.error("Error in handlePlay: ", error);
}
};
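// Opens the /v2v WebSocket (ws:// for localhost, wss:// otherwise) and wires up the
// message handler. The server drives the session with typed messages such as
// "initial", "media", "thinking", "pause", "continue", "clear", "chat", and "end";
// the promise resolves once the "initial" message confirms the connection.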
const connectToRealtimeTTS = async (): Promise<void> => {
return new Promise<void>((resolve, reject) => {
try {
const newAudioContext = new (window.AudioContext ||
(window as any).webkitAudioContext)();
audioContextRef.current = newAudioContext;
setAudioContextState(true);
const baseURL = import.meta.env.VITE_WEBSOCKET_URL;
const websocketURL = baseURL.includes("localhost")
? `ws://${baseURL}/v2v`
: `wss://${baseURL}/v2v`;
const ws = new WebSocket(websocketURL);
ws.onopen = () => {
// Initial connection established
};
ws.onmessage = async (event) => {
try {
const data = JSON.parse(event.data);
const { type, msg } = data;
switch (type) {
case "initial":
socketRef.current = ws;
setSocketConnected(true);
resolve();
break;
case "media":
const timestamp = Date.now();
bufferQueueRef.current.push(msg);
if (!playingRef.current && bufferQueueRef.current.length > 0) {
handlePlay(timestamp);
}
break;
case "info":
toast({
variant: "destructive",
title: "Error",
description: msg,
});
break;
case "thinking":
dispatch(setThinking(true));
break;
case "transcribing":
dispatch(setTranscribing(true));
break;
case "stop_transcribing":
dispatch(setTranscribing(false));
break;
case "connected":
dispatch(setConnected(true));
break;
case "ready":
isGrpcRef.current = true;
startAudioStream();
break;
case "pause":
if (sourceRef.current) {
sourceRef.current.onended = null;
sourceRef.current.stop();
sourceRef.current.disconnect();
sourceRef.current = null;
}
playingRef.current = false;
setGenerating(true);
setIsPlaying(false);
break;
case "continue":
if (lastShiftedRef.current) {
bufferQueueRef.current.unshift(lastShiftedRef.current);
lastShiftedRef.current = null;
}
setGenerating(false);
const currentTimestamp = Date.now();
handlePlay(currentTimestamp);
if ((socketRef.current as any).interval) {
clearInterval((socketRef.current as any).interval);
(socketRef.current as any).interval = null;
}
break;
case "clear":
bufferQueueRef.current = [];
playingRef.current = false;
setGenerating(false);
setIsPlaying(false);
if (sourceRef.current) {
sourceRef.current.onended = null;
sourceRef.current.stop();
sourceRef.current.disconnect();
sourceRef.current = null;
}
break;
case "end":
try {
if (audioContext) {
audioContext
.close()
.then(() => {
setAudioContextState(false);
if (isListening) {
toast({
variant: "destructive",
title: "Connection closed",
description: "Please restart the conversation.",
});
}
})
.catch(() => {
if (isListening) {
toast({
variant: "destructive",
title: "Error",
description: "Please restart the conversation.",
});
}
});
}
stopAudioStream();
dispatch(setIsListening(false));
} catch (error) {
console.error("Error in closing audioContext.");
}
break;
case "chat":
if (msg && activeChat) {
if (msg.role && msg.content) {
const messageType =
msg.role.toLowerCase() === "user" ? "user" : "assistant";
dispatch(
addMessage({
role: messageType,
content: msg.content,
})
);
}
// dispatch(updateChatTimestamp(activeChat));
}
break;
case "chathistory":
console.info("Chathistory");
break;
default:
break;
}
} catch (error) {
console.error("Error in websocket message handling:", error);
}
};
ws.onclose = async () => {
try {
if (audioStream) {
audioStream.getTracks().forEach((track) => track.stop());
setAudioStream(null);
}
setElapsedTime(0);
} catch (err) {
console.error("Error in closing audio stream:", err);
}
};
ws.onerror = (err) => {
console.error("WebSocket Error:", err);
reject(err);
};
} catch (err) {
console.error("Error in making WebSocket connection:", err);
reject(err);
}
});
};
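// Captures the microphone at 16 kHz and streams 16-bit PCM chunks to the server.
// A ScriptProcessorNode is used here; it is deprecated in favour of AudioWorkletNode
// but still widely supported. Note that this also creates a fresh playback
// AudioContext, replacing the one set up in connectToRealtimeTTS.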
const startAudioStream = async (): Promise<void> => {
try {
const startTime = Date.now();
setLatency(0);
const interval = setInterval(() => {
setElapsedTime(Date.now() - startTime);
}, 1000);
dispatch(setThinking(false));
if (!socketRef.current) {
toast({
variant: "destructive",
title: "Connection Error",
description: "Please try again. Socket not connected.",
});
return;
}
audioContextRef.current = new (window.AudioContext ||
(window as any).webkitAudioContext)();
const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
setAudioStream(stream);
const newAudioContext = new AudioContext({
sampleRate: 16000,
});
setAudioContext(newAudioContext);
const audioInput = newAudioContext.createMediaStreamSource(stream);
const bufferSize = 512;
const scriptProcessorNode = newAudioContext.createScriptProcessor(
bufferSize,
1,
1
);
scriptProcessorNode.onaudioprocess = async (e) => {
const inputData = e.inputBuffer.getChannelData(0);
// const l16Data = inputData;
const l16Data = floatTo16BitPCM(inputData);
try {
if (!isGrpcRef.current) {
try {
if (audioContextState && newAudioContext) {
newAudioContext
.close()
.then(() => {
setAudioContextState(false);
toast({
variant: "destructive",
title: "Connection Error",
description: "Please restart the conversation.",
});
})
.catch(() => {
toast({
variant: "destructive",
title: "Connection Error",
description: "Please restart the conversation.",
});
});
await stopAudioStream();
dispatch(setIsListening(false));
}
} catch (error) {
console.error("Error in closing audioContext:", error);
}
}
if (
isGrpcRef.current &&
socketRef.current &&
socketRef.current.readyState === WebSocket.OPEN
) {
socketRef.current.send(l16Data);
}
} catch (err) {
console.error("Error in sending buffer:", err);
}
};
audioInput.connect(scriptProcessorNode);
scriptProcessorNode.connect(newAudioContext.destination);
} catch (error) {
console.error("Error accessing microphone:", error);
toast({
variant: "destructive",
title: "Microphone Error",
description:
"Could not access your microphone. Please check your permissions.",
});
}
};
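// Tears the session down: resets UI state and clears the buffer queue. While the
// socket is still open, it also clears the latency interval, stops microphone tracks,
// sends a "stop" message, and closes both the WebSocket and the recording AudioContext.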
const stopAudioStream = async (): Promise<void> => {
setGenerating(false);
setLatency(0);
dispatch(setThinking(false));
dispatch(setConnected(false));
dispatch(setIsListening(false));
setElapsedTime(0);
bufferQueueRef.current = [];
if (socketRef.current && socketRef.current.readyState === WebSocket.OPEN) {
if ((socketRef.current as any).interval) {
clearInterval((socketRef.current as any).interval);
(socketRef.current as any).interval = null;
}
if (audioStream) {
try {
audioStream.getTracks().forEach((track) => track.stop());
setAudioStream(null);
} catch (err) {
console.error("Error stopping audio stream:", err);
}
}
try {
isGrpcRef.current = false;
socketRef.current.send(JSON.stringify({ type: "stop", msg: "stop" }));
socketRef.current.close();
socketRef.current = null;
} catch (err) {
console.error("Error closing WebSocket:", err);
}
try {
if (audioContext) {
await audioContext.close();
setAudioContext(null);
}
} catch (err) {
console.error("Error closing AudioContext:", err);
}
}
};
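// Mic button handler: on start it connects the WebSocket and sends a "start" message
// with the current settings (note the settings payload is JSON-stringified separately
// inside msg); on stop it runs stopAudioStream to tear everything down.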
const handleVoiceChatToggle = async (): Promise<void> => {
if (!activeChat) {
toast({
variant: "destructive",
title: "Error",
description: "Please select a chat or create one",
});
return;
}
if (!isListening) {
setLatency(0);
dispatch(setIsListening(true));
try {
await connectToRealtimeTTS();
if (socketRef.current) {
socketRef.current.send(
JSON.stringify({
type: "start",
msg: JSON.stringify({
temperature: temperature,
silenceDuration: silenceDuration,
activeVoice: activeVoice,
threshold: threshold,
sessionId: activeChat,
maxTokens: maxTokens,
}),
})
);
}
} catch (error) {
console.error("Failed to start voice chat:", error);
dispatch(setIsListening(false));
toast({
variant: "destructive",
title: "Connection Error",
description: "Failed to start voice chat. Please try again.",
});
}
} else {
// Run the cleanup while the socket is still open so the "stop" message is sent,
// then release any remaining socket reference.
await stopAudioStream();
socketRef.current?.close();
socketRef.current = null;
}
};
return (
<div className="flex flex-col items-center w-full max-w-md mx-auto">
<div className="w-full flex justify-between items-center mb-4">
{/* <div className={`text-sm text-white ${!isListening && "m-auto"}`}>
{isListening ? `Active: ${formatTime(elapsedTime)}` : "Ready"}
</div> */}
{isListening && (
<div className="text-sm text-white/60">Latency: {latency}ms</div>
)}
{/* <div className="relative mb-4">{isListening && <VoiceVisualizer />}</div> */}
{/* <LottieMicAnimation isListening={isListening} /> */}
{transcribing && (
<div className="flex gap-2 text-white">
<div>
<Loader2 className="animate-spin" />
</div>
<div>Transcribing</div>
</div>
)}
{thinking && (
<div className="flex gap-2 text-white">
<div>
<Loader2 className="animate-spin" />
</div>
<div>Thinking</div>
</div>
)}
</div>
<Button
onClick={handleVoiceChatToggle}
className={`w-16 h-16 rounded-full transition-all duration-300 ${
isListening
? "bg-red-500 hover:bg-red-600"
: "bg-emerald-500 hover:bg-emerald-600"
}`}
// disabled={generating || isPlaying}
>
{isListening ? (
!connected ? (
<Loader2 className="animate-spin" />
) : (
<MicOff className="h-6 w-6" />
)
) : generating ? (
<Loader2 className="h-6 w-6 animate-spin" />
) : (
<Mic className="h-6 w-6" />
)}
</Button>
<p className="text-white/70 mt-3 text-sm">
{isListening ? "Click to stop listening" : "Click to start voice chat"}
</p>
</div>
);
};
export default VoiceChat;