# LLM Voice Chat (Powered by Groq, ElevenLabs, and WebRTC ⚡️)

from fastrtc import (
    ReplyOnPause,
    AdditionalOutputs,
    Stream,
    aggregate_bytes_to_16bit,
    get_twilio_turn_credentials,
    WebRTCError,
    stt,
    audio_to_bytes,
)
import numpy as np
import gradio as gr
from gradio.utils import get_space
from groq import Groq
from elevenlabs import ElevenLabs
from dotenv import load_dotenv
import time
import os
from fastapi import FastAPI

# Load variables from a local .env file (if present) before building the
# clients, so the API keys read below are already in the environment.
load_dotenv()
# Groq client used by `response` for both transcription and chat completion.
# No key is passed here — presumably the SDK picks up GROQ_API_KEY from the
# environment; confirm against the Groq SDK docs.
groq_client = Groq()
# ElevenLabs client used by `response` for text-to-speech. os.getenv returns
# None when ELEVENLABS_API_KEY is unset, so a missing key is passed as None.
tts_client = ElevenLabs(api_key=os.getenv("ELEVENLABS_API_KEY"))


# See "Talk to Claude" in Cookbook for an example of how to keep
# track of the chat history.
def response(
    audio: tuple[int, np.ndarray],
    chatbot: list[dict] | None = None,
):
    """Answer one spoken user turn with synthesized speech.

    The captured audio is transcribed through the Groq transcription
    endpoint, the transcript plus the prior conversation is sent to a Groq
    chat model, and the reply is streamed back as audio from ElevenLabs.

    Args:
        audio: ``(sample_rate, samples)`` pair for the captured utterance.
        chatbot: Conversation so far as dicts with ``"role"`` and
            ``"content"`` keys, or ``None`` to start a new conversation.
            The list is copied; the caller's object is never mutated.

    Yields:
        ``AdditionalOutputs(history)`` right after transcription (so the user
        message can be shown immediately) and again once the reply has been
        streamed; in between, ``(24000, samples)`` tuples where ``samples``
        is an ``int16`` array of shape ``(1, n)``.

    Raises:
        WebRTCError: If any step fails. The message is the formatted
            traceback and the original exception is chained as the cause.
    """
    try:
        # Work on a copy so the caller's history list is left untouched.
        history = list(chatbot) if chatbot else []
        # Forward only role/content to the chat API; any extra keys on the
        # history entries are dropped.
        messages = [{"role": d["role"], "content": d["content"]} for d in history]
        # perf_counter is monotonic, so the elapsed time cannot go negative
        # if the wall clock is adjusted mid-request.
        start = time.perf_counter()
        # text = stt(audio)
        text = groq_client.audio.transcriptions.create(
            file=("audio-file.mp3", audio_to_bytes(audio)),
            model="whisper-large-v3-turbo",
            response_format="verbose_json",
        ).text
        print("transcription", time.perf_counter() - start)
        print("prompt", text)
        history.append({"role": "user", "content": text})
        yield AdditionalOutputs(history)
        messages.append({"role": "user", "content": text})
        completion = groq_client.chat.completions.create(
            model="llama-3.1-8b-instant",
            max_tokens=512,
            messages=messages,  # type: ignore
        )
        # The original needed a `type: ignore` when passing this value to TTS,
        # which suggests `content` can be None; normalise that to "".
        reply_text = completion.choices[0].message.content or ""

        history.append({"role": "assistant", "content": reply_text})

        # Nothing to synthesize for an empty reply, so skip the TTS call.
        if reply_text:
            speech = tts_client.text_to_speech.convert_as_stream(
                text=reply_text,
                voice_id="JBFqnCBsd6RMkjVDRZzb",
                model_id="eleven_multilingual_v2",
                output_format="pcm_24000",
            )
            # The sample rate yielded below must match the "pcm_24000" format
            # requested above.
            for chunk in aggregate_bytes_to_16bit(speech):
                samples = np.frombuffer(chunk, dtype=np.int16).reshape(1, -1)
                yield (24000, samples)
        yield AdditionalOutputs(history)
    except Exception as e:
        import traceback

        traceback.print_exc()
        # Chain the original exception so it is preserved as __cause__.
        raise WebRTCError(traceback.format_exc()) from e


# Chat transcript component. It is registered as both an additional input
# and an additional output of the stream below.
chatbot = gr.Chatbot(type="messages")

stream = Stream(
    handler=ReplyOnPause(response, input_sample_rate=16000),
    modality="audio",
    mode="send-receive",
    additional_inputs=[chatbot],
    additional_outputs=[chatbot],
    # Of the two values passed in, keep the second one.
    additional_outputs_handler=lambda _previous, latest: latest,
    # Twilio TURN credentials and the concurrency cap are only set when
    # get_space() is truthy (presumably when hosted on a Hugging Face Space).
    rtc_configuration=None if not get_space() else get_twilio_turn_credentials(),
    concurrency_limit=None if not get_space() else 20,
)
# Replace every HTML block in the Stream's generated UI with a custom title.
# The loop walks a snapshot because the mapping is written to inside the body,
# and the key is not called `id` so the builtin is not shadowed module-wide.
for block_id, block in list(stream.ui.blocks.items()):
    if isinstance(block, gr.HTML):
        stream.ui.blocks[block_id] = gr.HTML(
            """
                <h1 style='text-align: center'>
                LLM Voice Chat (Powered by Groq, ElevenLabs, and WebRTC ⚡️)
                </h1>
                """
        )

# Mount the Stream's generated Gradio UI on a FastAPI app at the root path,
# so the interface does not have to be built by hand.
app = FastAPI()
gr.mount_gradio_app(app, stream.ui, path="/")


if __name__ == "__main__":
    # The MODE environment variable selects how the demo is served; every
    # path listens on port 7860. `os` is already imported at the top of the
    # file, so it is not re-imported here.
    mode = os.getenv("MODE")
    if mode == "UI":
        # Launch the Stream's Gradio UI directly.
        stream.ui.launch(server_port=7860)
    elif mode == "PHONE":
        stream.fastphone(host="0.0.0.0", port=7860)
    else:
        # Default: serve the FastAPI app that has the UI mounted at "/".
        # uvicorn is imported lazily because only this branch needs it.
        import uvicorn

        uvicorn.run(app, host="0.0.0.0", port=7860)