import React, { useRef, useState } from "react";
import { Mic, MicOff, Loader2 } from "lucide-react";

import { Button } from "@/components/ui/button";
import { useToast } from "@/hooks/use-toast";
import { useAppDispatch, useAppSelector } from "@/redux/hooks";
import { addMessage, setIsListening } from "@/redux/slices/chatSlice";
import {
  setConnected,
  setThinking,
  setTranscribing,
} from "@/redux/slices/sessionSlice";

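// Metadata header prepended (as NUL-terminated JSON) to each base64 audio chunk
// received over the WebSocket; parsed in handlePlay before the audio is decoded.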
interface MessageMetadata {
  session_id: string;
  sequence_id: string;
  transcript: string;
}

const VoiceChat = () => {
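  // Local UI state for the socket, microphone stream/contexts, timers, and playback flags.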
  const [socketConnected, setSocketConnected] = useState(false);
  const [audioStream, setAudioStream] = useState<MediaStream | null>(null);
  const [audioContext, setAudioContext] = useState<AudioContext | null>(null);
  const [audioContextState, setAudioContextState] = useState(false);
  const [elapsedTime, setElapsedTime] = useState(0);
  const [latency, setLatency] = useState(0);
  const [generating, setGenerating] = useState(false);
  const [isPlaying, setIsPlaying] = useState(false);

  const { transcribing, thinking, connected } = useAppSelector(
    (state) => state.session
  );

  const { toast } = useToast();
  const dispatch = useAppDispatch();
  const { activeChat } = useAppSelector((state) => state.chats);
  const isListening = useAppSelector((state) => state.chat.isListening);
  const { temperature, activeVoice, maxTokens, threshold, silenceDuration } =
    useAppSelector((state) => state.settings);

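  // Refs are used (instead of state) for values read inside WebSocket and audio callbacks,
  // so the handlers always see the latest socket, audio source, and playback queue.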
  const socketRef = useRef<WebSocket | null>(null);
  const sourceRef = useRef<AudioBufferSourceNode | null>(null);
  const audioContextRef = useRef<AudioContext | null>(null);
  const playingRef = useRef<boolean>(false);
  const lastShiftedRef = useRef<string | null>(null);
  const bufferQueueRef = useRef<string[]>([]);
  const isGrpcRef = useRef<boolean | null>(null);
  // Holds the elapsed-time interval so it can be cleared in stopAudioStream.
  const elapsedIntervalRef = useRef<ReturnType<typeof setInterval> | null>(null);

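  // Formats an elapsed duration in milliseconds as HH:MM:SS.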
  const formatTime = (milliseconds: number): string => {
    const totalSeconds = Math.floor(milliseconds / 1000);
    const hours = Math.floor(totalSeconds / 3600);
    const minutes = Math.floor((totalSeconds % 3600) / 60);
    const seconds = totalSeconds % 60;

    return `${String(hours).padStart(2, "0")}:${String(minutes).padStart(
      2,
      "0"
    )}:${String(seconds).padStart(2, "0")}`;
  };

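  // Converts Float32 samples in [-1, 1] to 16-bit PCM (L16) for sending over the socket.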
  const floatTo16BitPCM = (input: Float32Array): ArrayBuffer => {
    const output = new Int16Array(input.length);
    for (let i = 0; i < input.length; i++) {
      const sample = Math.max(-1, Math.min(1, input[i]));
      output[i] = sample < 0 ? sample * 0x8000 : sample * 0x7fff;
    }
    return output.buffer;
  };

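  // Plays the next base64 chunk from bufferQueueRef. Each decoded chunk starts with a
  // NUL-terminated JSON header (MessageMetadata) followed by the encoded audio. A chunk
  // whose sequence_id is "-2" is not decoded: it starts the latency timer, which the first
  // real audio chunk stops. When a buffer finishes playing, a "status" ack is sent to the
  // server and the next queued chunk is played.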
  const handlePlay = async (timestamp: number): Promise<void> => {
    try {
      if (!isGrpcRef.current) {
        return;
      }

      playingRef.current = true;
      setIsPlaying(true);

      const base64Data = bufferQueueRef.current.shift();
      if (!base64Data) {
        playingRef.current = false;
        setIsPlaying(false);
        return;
      }

      lastShiftedRef.current = base64Data;

      // Decode base64 into raw bytes: [JSON metadata][NUL][audio data].
      const bytes = Uint8Array.from(atob(base64Data), (char) =>
        char.charCodeAt(0)
      );

      let arrayBuffer = bytes.buffer;

      const metadataEndIndex = bytes.indexOf(0);
      const metadataStr = new TextDecoder().decode(
        bytes.slice(0, metadataEndIndex)
      );
      const metadata = JSON.parse(metadataStr) as MessageMetadata;

      const { session_id, sequence_id, transcript } = metadata;

      if (sequence_id !== "-2") {
        // First real audio chunk: stop the latency timer started by the "-2" marker.
        if (socketRef.current && (socketRef.current as any).interval) {
          clearInterval((socketRef.current as any).interval);
          (socketRef.current as any).interval = null;
          setLatency((prev) => prev + 150);
          dispatch(setThinking(false));
        }

        arrayBuffer = arrayBuffer.slice(metadataEndIndex + 1);

        try {
          if (audioContextRef.current?.state === "suspended") {
            await audioContextRef.current.resume();
          }

          if (!audioContextRef.current) {
            return;
          }

          const audioBuffer = await audioContextRef.current.decodeAudioData(
            arrayBuffer
          );

          if (sourceRef.current) {
            sourceRef.current.disconnect();
          }

          const source = audioContextRef.current.createBufferSource();
          source.buffer = audioBuffer;
          source.connect(audioContextRef.current.destination);
          source.start(0);

          sourceRef.current = source;
          (sourceRef.current as any).session_id = session_id;
          (sourceRef.current as any).sequence_id = sequence_id;
          (sourceRef.current as any).transcript = transcript;

          sourceRef.current.onended = () => {
            lastShiftedRef.current = null;

            // Acknowledge the finished chunk so the server can track playback progress.
            if (
              socketRef.current?.readyState === WebSocket.OPEN &&
              (sourceRef.current as any)?.sequence_id
            ) {
              socketRef.current.send(
                JSON.stringify({
                  type: "status",
                  msg: {
                    session_id: (sourceRef.current as any)?.session_id,
                    sequence_id: (sourceRef.current as any)?.sequence_id,
                    transcript: (sourceRef.current as any)?.transcript,
                  },
                })
              );
            }

            // Chain playback of the next buffered chunk, if any.
            if (bufferQueueRef.current.length > 0) {
              playingRef.current = true;
              setIsPlaying(true);
              handlePlay(Date.now());
            } else {
              playingRef.current = false;
              setIsPlaying(false);
            }
          };
        } catch (error) {
          console.error("Error decoding audio data:", error);
        }
      } else {
        // "-2" marker: start a timer that tracks latency until the first audio chunk.
        const startTime = Date.now();
        const interval = setInterval(() => {
          setLatency(Date.now() - startTime);
        }, 10);

        (socketRef.current as any).interval = interval;

        if (bufferQueueRef.current.length > 0) {
          playingRef.current = true;
          setIsPlaying(true);
          handlePlay(Date.now());
        } else {
          playingRef.current = false;
          setIsPlaying(false);
        }
      }
    } catch (error) {
      console.error("Error in handlePlay:", error);
    }
  };

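  // Opens the realtime voice WebSocket and wires up handlers for the server's message
  // types ("initial", "media", "thinking", "pause", "continue", "clear", "end", "chat", ...).
  // The returned promise resolves once the server sends its "initial" message.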
  const connectToRealtimeTTS = async (): Promise<void> => {
    return new Promise<void>((resolve, reject) => {
      try {
        const newAudioContext = new (window.AudioContext ||
          (window as any).webkitAudioContext)();
        audioContextRef.current = newAudioContext;
        setAudioContextState(true);

        const baseURL = import.meta.env.VITE_WEBSOCKET_URL;
        const websocketURL = baseURL.includes("localhost")
          ? `ws://${baseURL}/v2v`
          : `wss://${baseURL}/v2v`;

        const ws = new WebSocket(websocketURL);

        ws.onmessage = async (event) => {
          try {
            const data = JSON.parse(event.data);
            const { type, msg } = data;

            switch (type) {
              case "initial":
                socketRef.current = ws;
                setSocketConnected(true);
                resolve();
                break;
              case "media": {
                // Queue the incoming audio chunk and start playback if currently idle.
                const timestamp = Date.now();
                bufferQueueRef.current.push(msg);

                if (!playingRef.current && bufferQueueRef.current.length > 0) {
                  handlePlay(timestamp);
                }
                break;
              }
              case "info":
                toast({
                  variant: "destructive",
                  title: "Error",
                  description: msg,
                });
                break;
              case "thinking":
                dispatch(setThinking(true));
                break;
              case "transcribing":
                dispatch(setTranscribing(true));
                break;
              case "stop_transcribing":
                dispatch(setTranscribing(false));
                break;
              case "connected":
                dispatch(setConnected(true));
                break;
              case "ready":
                // Server-side pipeline is ready; start streaming microphone audio.
                isGrpcRef.current = true;
                startAudioStream();
                break;
              case "pause":
                // Barge-in: stop the currently playing buffer but keep the queue.
                if (sourceRef.current) {
                  sourceRef.current.onended = null;
                  sourceRef.current.stop();
                  sourceRef.current.disconnect();
                  sourceRef.current = null;
                }
                playingRef.current = false;
                setGenerating(true);
                setIsPlaying(false);
                break;
              case "continue":
                // Resume playback, replaying the chunk that was interrupted, if any.
                if (lastShiftedRef.current) {
                  bufferQueueRef.current.unshift(lastShiftedRef.current);
                  lastShiftedRef.current = null;
                }
                setGenerating(false);
                handlePlay(Date.now());
                if (socketRef.current && (socketRef.current as any).interval) {
                  clearInterval((socketRef.current as any).interval);
                  (socketRef.current as any).interval = null;
                }
                break;
              case "clear":
                // Drop all queued audio and stop the current source.
                bufferQueueRef.current = [];
                playingRef.current = false;
                setGenerating(false);
                setIsPlaying(false);

                if (sourceRef.current) {
                  sourceRef.current.onended = null;
                  sourceRef.current.stop();
                  sourceRef.current.disconnect();
                  sourceRef.current = null;
                }
                break;
              case "end":
                try {
                  // Note: audioContext and isListening are captured from the render in
                  // which this handler was created, so they may be stale here.
                  if (audioContext) {
                    audioContext
                      .close()
                      .then(() => {
                        setAudioContextState(false);
                        if (isListening) {
                          toast({
                            variant: "destructive",
                            title: "Connection closed",
                            description: "Please restart the conversation.",
                          });
                        }
                      })
                      .catch(() => {
                        if (isListening) {
                          toast({
                            variant: "destructive",
                            title: "Error",
                            description: "Please restart the conversation.",
                          });
                        }
                      });
                  }
                  stopAudioStream();
                  dispatch(setIsListening(false));
                } catch (error) {
                  console.error("Error in closing audioContext:", error);
                }
                break;
              case "chat":
                if (msg && activeChat && msg.role && msg.content) {
                  const messageType =
                    msg.role.toLowerCase() === "user" ? "user" : "assistant";
                  dispatch(
                    addMessage({
                      role: messageType,
                      content: msg.content,
                    })
                  );
                }
                break;
              case "chathistory":
                console.info("Chathistory");
                break;
              default:
                break;
            }
          } catch (error) {
            console.error("Error in websocket message handling:", error);
          }
        };

        ws.onclose = async () => {
          try {
            if (audioStream) {
              audioStream.getTracks().forEach((track) => track.stop());
              setAudioStream(null);
            }
            setElapsedTime(0);
          } catch (err) {
            console.error("Error in closing audio stream:", err);
          }
        };

        ws.onerror = (err) => {
          console.error("WebSocket Error:", err);
          reject(err);
        };
      } catch (err) {
        console.error("Error in making WebSocket connection:", err);
        reject(err);
      }
    });
  };

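  // Requests microphone access, captures audio at 16 kHz through a ScriptProcessorNode,
  // converts each buffer to 16-bit PCM, and streams it over the WebSocket while it is open.
  // (ScriptProcessorNode is deprecated; an AudioWorklet would be the modern replacement.)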
  const startAudioStream = async (): Promise<void> => {
    try {
      if (!socketRef.current) {
        toast({
          variant: "destructive",
          title: "Connection Error",
          description: "Please try again. Socket not connected.",
        });
        return;
      }

      // Track elapsed session time; the interval is cleared in stopAudioStream.
      const startTime = Date.now();
      setLatency(0);
      if (elapsedIntervalRef.current) {
        clearInterval(elapsedIntervalRef.current);
      }
      elapsedIntervalRef.current = setInterval(() => {
        setElapsedTime(Date.now() - startTime);
      }, 1000);
      dispatch(setThinking(false));

      // Playback context (replaces the one created in connectToRealtimeTTS).
      audioContextRef.current = new (window.AudioContext ||
        (window as any).webkitAudioContext)();

      const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
      setAudioStream(stream);

      // Capture context fixed at 16 kHz for the PCM stream sent to the server.
      const newAudioContext = new AudioContext({
        sampleRate: 16000,
      });

      setAudioContext(newAudioContext);
      const audioInput = newAudioContext.createMediaStreamSource(stream);
      const bufferSize = 512;
      const scriptProcessorNode = newAudioContext.createScriptProcessor(
        bufferSize,
        1,
        1
      );

      scriptProcessorNode.onaudioprocess = async (e) => {
        const inputData = e.inputBuffer.getChannelData(0);

        const l16Data = floatTo16BitPCM(inputData);
        try {
          if (!isGrpcRef.current) {
            // Server pipeline is no longer active; tear down the capture context.
            // (audioContextState is captured from the creating render and may be stale.)
            try {
              if (audioContextState && newAudioContext) {
                newAudioContext
                  .close()
                  .then(() => {
                    setAudioContextState(false);
                    toast({
                      variant: "destructive",
                      title: "Connection Error",
                      description: "Please restart the conversation.",
                    });
                  })
                  .catch(() => {
                    toast({
                      variant: "destructive",
                      title: "Connection Error",
                      description: "Please restart the conversation.",
                    });
                  });
                await stopAudioStream();
                dispatch(setIsListening(false));
              }
            } catch (error) {
              console.error("Error in closing audioContext:", error);
            }
          }

          if (
            isGrpcRef.current &&
            socketRef.current &&
            socketRef.current.readyState === WebSocket.OPEN
          ) {
            socketRef.current.send(l16Data);
          }
        } catch (err) {
          console.error("Error in sending buffer:", err);
        }
      };

      audioInput.connect(scriptProcessorNode);
      scriptProcessorNode.connect(newAudioContext.destination);
    } catch (error) {
      console.error("Error accessing microphone:", error);
      toast({
        variant: "destructive",
        title: "Microphone Error",
        description:
          "Could not access your microphone. Please check your permissions.",
      });
    }
  };

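  // Stops capture and playback: resets flags and timers, clears the buffer queue,
  // releases the microphone, notifies the server ("stop"), and closes the socket and
  // capture AudioContext.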
  const stopAudioStream = async (): Promise<void> => {
    setGenerating(false);
    setLatency(0);
    dispatch(setThinking(false));
    dispatch(setConnected(false));
    dispatch(setIsListening(false));
    setElapsedTime(0);
    bufferQueueRef.current = [];

    if (elapsedIntervalRef.current) {
      clearInterval(elapsedIntervalRef.current);
      elapsedIntervalRef.current = null;
    }

    if (socketRef.current && socketRef.current.readyState === WebSocket.OPEN) {
      if ((socketRef.current as any).interval) {
        clearInterval((socketRef.current as any).interval);
        (socketRef.current as any).interval = null;
      }

      if (audioStream) {
        try {
          audioStream.getTracks().forEach((track) => track.stop());
          setAudioStream(null);
        } catch (err) {
          console.error("Error stopping audio stream:", err);
        }
      }

      try {
        isGrpcRef.current = false;
        socketRef.current.send(JSON.stringify({ type: "stop", msg: "stop" }));
        socketRef.current.close();
        socketRef.current = null;
      } catch (err) {
        console.error("Error closing WebSocket:", err);
      }

      try {
        if (audioContext) {
          await audioContext.close();
          setAudioContext(null);
        }
      } catch (err) {
        console.error("Error closing AudioContext:", err);
      }
    }
  };

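  // Toggles the voice session: on start, connects the WebSocket and sends the "start"
  // message with the current settings; on stop, tears everything down.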
  const handleVoiceChatToggle = async (): Promise<void> => {
    if (!activeChat) {
      toast({
        variant: "destructive",
        title: "Error",
        description: "Please select a chat or create one",
      });
      return;
    }
    if (!isListening) {
      setLatency(0);
      dispatch(setIsListening(true));
      try {
        await connectToRealtimeTTS();
        if (socketRef.current) {
          socketRef.current.send(
            JSON.stringify({
              type: "start",
              msg: JSON.stringify({
                temperature,
                silenceDuration,
                activeVoice,
                threshold,
                sessionId: activeChat,
                maxTokens,
              }),
            })
          );
        }
      } catch (error) {
        console.error("Failed to start voice chat:", error);
        dispatch(setIsListening(false));
        toast({
          variant: "destructive",
          title: "Connection Error",
          description: "Failed to start voice chat. Please try again.",
        });
      }
    } else {
      // Stop listening: stopAudioStream closes the socket while it is still open.
      await stopAudioStream();
      socketRef.current?.close();
      socketRef.current = null;
    }
  };

  return (
    <div className="flex flex-col items-center w-full max-w-md mx-auto">
      <div className="w-full flex justify-between items-center mb-4">
        {/* <div className={`text-sm text-white ${!isListening && "m-auto"}`}>
          {isListening ? `Active: ${formatTime(elapsedTime)}` : "Ready"}
        </div> */}
        {isListening && (
          <div className="text-sm text-white/60">Latency: {latency}ms</div>
        )}

        {/* <div className="relative mb-4">{isListening && <VoiceVisualizer />}</div> */}

        {/* <LottieMicAnimation isListening={isListening} /> */}

        {transcribing && (
          <div className="flex gap-2 text-white">
            <div>
              <Loader2 className="animate-spin" />
            </div>
            <div>Transcribing</div>
          </div>
        )}
        {thinking && (
          <div className="flex gap-2 text-white">
            <div>
              <Loader2 className="animate-spin" />
            </div>
            <div>Thinking</div>
          </div>
        )}
      </div>

      <Button
        onClick={handleVoiceChatToggle}
        className={`w-16 h-16 rounded-full transition-all duration-300 ${
          isListening
            ? "bg-red-500 hover:bg-red-600"
            : "bg-emerald-500 hover:bg-emerald-600"
        }`}
        // disabled={generating || isPlaying}
      >
        {isListening ? (
          !connected ? (
            <Loader2 className="animate-spin" />
          ) : (
            <MicOff className="h-6 w-6" />
          )
        ) : generating ? (
          <Loader2 className="h-6 w-6 animate-spin" />
        ) : (
          <Mic className="h-6 w-6" />
        )}
      </Button>

      <p className="text-white/70 mt-3 text-sm">
        {isListening ? "Click to stop listening" : "Click to start voice chat"}
      </p>
    </div>
  );
};

export default VoiceChat;