// VocRT/frontend/src/components/VoiceChat.tsx
import React, { useRef, useState } from "react";
import { Mic, MicOff, Loader2 } from "lucide-react";
import { Button } from "@/components/ui/button";
import { useToast } from "@/hooks/use-toast";
import { useAppDispatch, useAppSelector } from "@/redux/hooks";
import { addMessage, setIsListening } from "@/redux/slices/chatSlice";
import {
setConnected,
setThinking,
setTranscribing,
} from "@/redux/slices/sessionSlice";
interface MessageMetadata {
session_id: string;
sequence_id: string;
transcript: string;
}
const VoiceChat = () => {
// WebSocket and audio state
const [socketConnected, setSocketConnected] = useState(false);
const [audioStream, setAudioStream] = useState<MediaStream | null>(null);
const [audioContext, setAudioContext] = useState<AudioContext | null>(null);
const [audioContextState, setAudioContextState] = useState(false);
const [elapsedTime, setElapsedTime] = useState(0);
const [latency, setLatency] = useState(0);
const [generating, setGenerating] = useState(false);
const [isPlaying, setIsPlaying] = useState(false);
const { sessionId, transcribing, thinking, connected } = useAppSelector(
(state) => state.session
);
// App state management with Redux
const { toast } = useToast();
const dispatch = useAppDispatch();
const { activeChat } = useAppSelector((state) => state.chats);
const isListening = useAppSelector((state) => state.chat.isListening);
const { temperature, activeVoice, maxTokens, threshold, silenceDuration } =
useAppSelector((state) => state.settings);
// Refs for audio processing
const socketRef = useRef<WebSocket | null>(null);
const sourceRef = useRef<AudioBufferSourceNode | null>(null);
const audioContextRef = useRef<AudioContext | null>(null);
const playingRef = useRef<boolean>(false);
const lastShiftedRef = useRef<string | null>(null);
const bufferQueueRef = useRef<string[]>([]);
const isGrpcRef = useRef<boolean | null>(null);
const formatTime = (milliseconds: number): string => {
const totalSeconds = Math.floor(milliseconds / 1000);
const hours = Math.floor(totalSeconds / 3600);
const minutes = Math.floor((totalSeconds % 3600) / 60);
const seconds = totalSeconds % 60;
return `${String(hours).padStart(2, "0")}:${String(minutes).padStart(
2,
"0"
)}:${String(seconds).padStart(2, "0")}`;
};
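// Convert Float32 samples in [-1, 1] from the Web Audio API into signed 16-bit PCM,
// which is the raw format streamed to the backend over the WebSocket.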
const floatTo16BitPCM = (input: Float32Array): ArrayBuffer => {
const output = new Int16Array(input.length);
for (let i = 0; i < input.length; i++) {
const sample = Math.max(-1, Math.min(1, input[i]));
output[i] = sample < 0 ? sample * 0x8000 : sample * 0x7fff;
}
return output.buffer;
};
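// Pulls the next base64-encoded frame off the buffer queue, decodes it, and plays it
// through the shared AudioContext. When a frame finishes, onended reports the played
// segment back to the server and chains playback of the next queued frame.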
const handlePlay = async (timestamp: number): Promise<void> => {
try {
if (!isGrpcRef.current) {
return;
}
playingRef.current = true;
setIsPlaying(true);
const base64Data = bufferQueueRef.current.shift();
if (!base64Data) {
playingRef.current = false;
setIsPlaying(false);
return;
}
lastShiftedRef.current = base64Data;
const bytes = Uint8Array.from(atob(base64Data), (char) => char.charCodeAt(0));
let arrayBuffer = bytes.buffer;
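// Each decoded frame starts with a UTF-8 JSON metadata header, followed by a
// NUL (0x00) separator and then the raw audio bytes.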
const metadataEndIndex = bytes.indexOf(0);
const metadataStr = new TextDecoder().decode(
bytes.slice(0, metadataEndIndex)
);
const metadata = JSON.parse(metadataStr) as MessageMetadata;
const { session_id, sequence_id, transcript } = metadata;
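// A sequence_id of "-2" appears to be a sentinel frame: instead of carrying audio,
// it starts the latency timer that runs until the first real audio frame is played.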
if (sequence_id !== "-2") {
if (socketRef.current && (socketRef.current as any).interval) {
clearInterval((socketRef.current as any).interval);
(socketRef.current as any).interval = null;
setLatency((prev) => prev + 150);
dispatch(setThinking(false));
}
arrayBuffer = arrayBuffer.slice(metadataEndIndex + 1);
try {
if (audioContextRef.current?.state === "suspended") {
await audioContextRef.current.resume();
}
if (!audioContextRef.current) {
return;
}
const audioBuffer = await audioContextRef.current.decodeAudioData(
arrayBuffer
);
if (sourceRef.current) {
sourceRef.current.disconnect();
}
const source = audioContextRef.current.createBufferSource();
source.buffer = audioBuffer;
source.connect(audioContextRef.current.destination);
source.start(0);
sourceRef.current = source;
(sourceRef.current as any).session_id = session_id;
(sourceRef.current as any).sequence_id = sequence_id;
(sourceRef.current as any).transcript = transcript;
sourceRef.current.onended = () => {
lastShiftedRef.current = null;
if (
socketRef.current?.readyState === WebSocket.OPEN &&
(sourceRef.current as any)?.sequence_id
) {
socketRef.current.send(
JSON.stringify({
type: "status",
msg: {
session_id: (sourceRef.current as any)?.session_id,
sequence_id: (sourceRef.current as any)?.sequence_id,
transcript: (sourceRef.current as any)?.transcript,
},
})
);
}
if (bufferQueueRef.current.length > 0) {
playingRef.current = true;
setIsPlaying(true);
const currentTimestamp = Date.now();
handlePlay(currentTimestamp);
} else {
playingRef.current = false;
setIsPlaying(false);
}
};
} catch (error) {
console.error("Error decoding audio data:", error);
}
} else {
const startTime = Date.now();
const interval = setInterval(() => {
setLatency(Date.now() - startTime);
}, 10);
(socketRef.current as any).interval = interval;
if (bufferQueueRef.current.length > 0) {
playingRef.current = true;
setIsPlaying(true);
const currentTimestamp = Date.now();
handlePlay(currentTimestamp);
} else {
playingRef.current = false;
setIsPlaying(false);
}
}
} catch (error) {
console.error("Error in handlePlay: ", error);
}
};
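// Opens the /v2v WebSocket (ws:// for localhost, wss:// otherwise) and wires up the
// message handler. The server drives the session with typed messages such as
// "initial", "media", "thinking", "pause", "continue", "clear", "chat", and "end";
// the promise resolves once the "initial" message confirms the connection.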
const connectToRealtimeTTS = async (): Promise<void> => {
return new Promise<void>((resolve, reject) => {
try {
const newAudioContext = new (window.AudioContext ||
(window as any).webkitAudioContext)();
audioContextRef.current = newAudioContext;
setAudioContextState(true);
const baseURL = import.meta.env.VITE_WEBSOCKET_URL;
const websocketURL = baseURL.includes("localhost")
? `ws://${baseURL}/v2v`
: `wss://${baseURL}/v2v`;
const ws = new WebSocket(websocketURL);
ws.onopen = () => {
// Initial connection established
};
ws.onmessage = async (event) => {
try {
const data = JSON.parse(event.data);
const { type, msg } = data;
switch (type) {
case "initial":
socketRef.current = ws;
setSocketConnected(true);
resolve();
break;
case "media":
const timestamp = Date.now();
bufferQueueRef.current.push(msg);
if (!playingRef.current && bufferQueueRef.current.length > 0) {
handlePlay(timestamp);
}
break;
case "info":
toast({
variant: "destructive",
title: "Error",
description: msg,
});
break;
case "thinking":
dispatch(setThinking(true));
break;
case "transcribing":
dispatch(setTranscribing(true));
break;
case "stop_transcribing":
dispatch(setTranscribing(false));
break;
case "connected":
dispatch(setConnected(true));
break;
case "ready":
isGrpcRef.current = true;
startAudioStream();
break;
case "pause":
if (sourceRef.current) {
sourceRef.current.onended = null;
sourceRef.current.stop();
sourceRef.current.disconnect();
sourceRef.current = null;
}
playingRef.current = false;
setGenerating(true);
setIsPlaying(false);
break;
case "continue":
if (lastShiftedRef.current) {
bufferQueueRef.current.unshift(lastShiftedRef.current);
lastShiftedRef.current = null;
}
setGenerating(false);
const currentTimestamp = Date.now();
handlePlay(currentTimestamp);
if ((socketRef.current as any).interval) {
clearInterval((socketRef.current as any).interval);
(socketRef.current as any).interval = null;
}
break;
case "clear":
bufferQueueRef.current = [];
playingRef.current = false;
setGenerating(false);
setIsPlaying(false);
if (sourceRef.current) {
sourceRef.current.onended = null;
sourceRef.current.stop();
sourceRef.current.disconnect();
sourceRef.current = null;
}
break;
case "end":
try {
if (audioContext) {
audioContext
.close()
.then(() => {
setAudioContextState(false);
if (isListening) {
toast({
variant: "destructive",
title: "Connection closed",
description: "Please restart the conversation.",
});
}
})
.catch(() => {
if (isListening) {
toast({
variant: "destructive",
title: "Error",
description: "Please restart the conversation.",
});
}
});
}
stopAudioStream();
dispatch(setIsListening(false));
} catch (error) {
console.error("Error in closing audioContext.");
}
break;
case "chat":
if (msg && activeChat) {
if (msg.role && msg.content) {
const messageType =
msg.role.toLowerCase() === "user" ? "user" : "assistant";
dispatch(
addMessage({
role: messageType,
content: msg.content,
})
);
}
// dispatch(updateChatTimestamp(activeChat));
}
break;
case "chathistory":
console.info("Chathistory");
break;
default:
break;
}
} catch (error) {
console.error("Error in websocket message handling:", error);
}
};
ws.onclose = async () => {
try {
if (audioStream) {
audioStream.getTracks().forEach((track) => track.stop());
setAudioStream(null);
}
setElapsedTime(0);
} catch (err) {
console.error("Error in closing audio stream:", err);
}
};
ws.onerror = (err) => {
console.error("WebSocket Error:", err);
reject(err);
};
} catch (err) {
console.error("Error in making WebSocket connection:", err);
reject(err);
}
});
};
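// Captures the microphone at 16 kHz and streams 16-bit PCM chunks to the server.
// A ScriptProcessorNode is used here; it is deprecated in favour of AudioWorkletNode
// but still widely supported. Note that this also creates a fresh playback
// AudioContext, replacing the one set up in connectToRealtimeTTS.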
const startAudioStream = async (): Promise<void> => {
try {
const startTime = Date.now();
setLatency(0);
const interval = setInterval(() => {
setElapsedTime(Date.now() - startTime);
}, 1000);
dispatch(setThinking(false));
if (!socketRef.current) {
toast({
variant: "destructive",
title: "Connection Error",
description: "Please try again. Socket not connected.",
});
return;
}
audioContextRef.current = new (window.AudioContext ||
(window as any).webkitAudioContext)();
const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
setAudioStream(stream);
const newAudioContext = new AudioContext({
sampleRate: 16000,
});
setAudioContext(newAudioContext);
const audioInput = newAudioContext.createMediaStreamSource(stream);
const bufferSize = 512;
const scriptProcessorNode = newAudioContext.createScriptProcessor(
bufferSize,
1,
1
);
scriptProcessorNode.onaudioprocess = async (e) => {
const inputData = e.inputBuffer.getChannelData(0);
// const l16Data = inputData;
const l16Data = floatTo16BitPCM(inputData);
try {
if (!isGrpcRef.current) {
try {
if (audioContextState && newAudioContext) {
newAudioContext
.close()
.then(() => {
setAudioContextState(false);
toast({
variant: "destructive",
title: "Connection Error",
description: "Please restart the conversation.",
});
})
.catch(() => {
toast({
variant: "destructive",
title: "Connection Error",
description: "Please restart the conversation.",
});
});
await stopAudioStream();
dispatch(setIsListening(false));
}
} catch (error) {
console.error("Error in closing audioContext:", error);
}
}
if (
isGrpcRef.current &&
socketRef.current &&
socketRef.current.readyState === WebSocket.OPEN
) {
socketRef.current.send(l16Data);
}
} catch (err) {
console.error("Error in sending buffer:", err);
}
};
audioInput.connect(scriptProcessorNode);
scriptProcessorNode.connect(newAudioContext.destination);
} catch (error) {
console.error("Error accessing microphone:", error);
toast({
variant: "destructive",
title: "Microphone Error",
description:
"Could not access your microphone. Please check your permissions.",
});
}
};
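// Tears the session down: resets UI state and clears the buffer queue. While the
// socket is still open, it also clears the latency interval, stops microphone tracks,
// sends a "stop" message, and closes both the WebSocket and the recording AudioContext.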
const stopAudioStream = async (): Promise<void> => {
setGenerating(false);
setLatency(0);
dispatch(setThinking(false));
dispatch(setConnected(false));
dispatch(setIsListening(false));
setElapsedTime(0);
bufferQueueRef.current = [];
if (socketRef.current && socketRef.current.readyState === WebSocket.OPEN) {
if ((socketRef.current as any).interval) {
clearInterval((socketRef.current as any).interval);
(socketRef.current as any).interval = null;
}
if (audioStream) {
try {
audioStream.getTracks().forEach((track) => track.stop());
setAudioStream(null);
} catch (err) {
console.error("Error stopping audio stream:", err);
}
}
try {
isGrpcRef.current = false;
socketRef.current.send(JSON.stringify({ type: "stop", msg: "stop" }));
socketRef.current.close();
socketRef.current = null;
} catch (err) {
console.error("Error closing WebSocket:", err);
}
try {
if (audioContext) {
await audioContext.close();
setAudioContext(null);
}
} catch (err) {
console.error("Error closing AudioContext:", err);
}
}
};
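// Mic button handler: on start it connects the WebSocket and sends a "start" message
// with the current settings (note the settings payload is JSON-stringified separately
// inside msg); on stop it runs stopAudioStream to tear everything down.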
const handleVoiceChatToggle = async (): Promise<void> => {
if (!activeChat) {
toast({
variant: "destructive",
title: "Error",
description: "Please select a chat or create one",
});
return;
}
if (!isListening) {
setLatency(0);
dispatch(setIsListening(true));
try {
await connectToRealtimeTTS();
if (socketRef.current) {
socketRef.current.send(
JSON.stringify({
type: "start",
msg: JSON.stringify({
temperature: temperature,
silenceDuration: silenceDuration,
activeVoice: activeVoice,
threshold: threshold,
sessionId: activeChat,
maxTokens: maxTokens,
}),
})
);
}
} catch (error) {
console.error("Failed to start voice chat:", error);
dispatch(setIsListening(false));
toast({
variant: "destructive",
title: "Connection Error",
description: "Failed to start voice chat. Please try again.",
});
}
} else {
// Run the cleanup while the socket is still open so the "stop" message is sent,
// then release any remaining socket reference.
await stopAudioStream();
socketRef.current?.close();
socketRef.current = null;
}
};
return (
<div className="flex flex-col items-center w-full max-w-md mx-auto">
<div className="w-full flex justify-between items-center mb-4">
{/* <div className={`text-sm text-white ${!isListening && "m-auto"}`}>
{isListening ? `Active: ${formatTime(elapsedTime)}` : "Ready"}
</div> */}
{isListening && (
<div className="text-sm text-white/60">Latency: {latency}ms</div>
)}
{/* <div className="relative mb-4">{isListening && <VoiceVisualizer />}</div> */}
{/* <LottieMicAnimation isListening={isListening} /> */}
{transcribing && (
<div className="flex gap-2 text-white">
<div>
<Loader2 className="animate-spin" />
</div>
<div>Transcribing</div>
</div>
)}
{thinking && (
<div className="flex gap-2 text-white">
<div>
<Loader2 className="animate-spin" />
</div>
<div>Thinking</div>
</div>
)}
</div>
<Button
onClick={handleVoiceChatToggle}
className={`w-16 h-16 rounded-full transition-all duration-300 ${
isListening
? "bg-red-500 hover:bg-red-600"
: "bg-emerald-500 hover:bg-emerald-600"
}`}
// disabled={generating || isPlaying}
>
{isListening ? (
!connected ? (
<Loader2 className="animate-spin" />
) : (
<MicOff className="h-6 w-6" />
)
) : generating ? (
<Loader2 className="h-6 w-6 animate-spin" />
) : (
<Mic className="h-6 w-6" />
)}
</Button>
<p className="text-white/70 mt-3 text-sm">
{isListening ? "Click to stop listening" : "Click to start voice chat"}
</p>
</div>
);
};
export default VoiceChat;