from fastrtc import ( ReplyOnPause, AdditionalOutputs, Stream, aggregate_bytes_to_16bit, get_twilio_turn_credentials, WebRTCError, stt, audio_to_bytes, ) import numpy as np import gradio as gr from gradio.utils import get_space from groq import Groq from elevenlabs import ElevenLabs from dotenv import load_dotenv import time import os from fastapi import FastAPI load_dotenv() groq_client = Groq() tts_client = ElevenLabs(api_key=os.getenv("ELEVENLABS_API_KEY")) # See "Talk to Claude" in Cookbook for an example of how to keep # track of the chat history. def response( audio: tuple[int, np.ndarray], chatbot: list[dict] | None = None, ): try: chatbot = chatbot or [] messages = [{"role": d["role"], "content": d["content"]} for d in chatbot] start = time.time() # text = stt(audio) text = groq_client.audio.transcriptions.create( file=("audio-file.mp3", audio_to_bytes(audio)), model="whisper-large-v3-turbo", response_format="verbose_json", ).text print("transcription", time.time() - start) print("prompt", text) chatbot.append({"role": "user", "content": text}) yield AdditionalOutputs(chatbot) messages.append({"role": "user", "content": text}) response_text = ( groq_client.chat.completions.create( model="llama-3.1-8b-instant", max_tokens=512, messages=messages, # type: ignore ) .choices[0] .message.content ) chatbot.append({"role": "assistant", "content": response_text}) iterator = tts_client.text_to_speech.convert_as_stream( text=response_text, # type: ignore voice_id="JBFqnCBsd6RMkjVDRZzb", model_id="eleven_multilingual_v2", output_format="pcm_24000", ) for chunk in aggregate_bytes_to_16bit(iterator): audio_array = np.frombuffer(chunk, dtype=np.int16).reshape(1, -1) yield (24000, audio_array) yield AdditionalOutputs(chatbot) except Exception as e: import traceback traceback.print_exc() raise WebRTCError(traceback.format_exc()) chatbot = gr.Chatbot(type="messages") stream = Stream( modality="audio", mode="send-receive", handler=ReplyOnPause(response, input_sample_rate=16000), additional_outputs_handler=lambda a, b: b, additional_inputs=[chatbot], additional_outputs=[chatbot], rtc_configuration=get_twilio_turn_credentials() if get_space() else None, concurrency_limit=20 if get_space() else None, ) for id, block in stream.ui.blocks.items(): if isinstance(block, gr.HTML): stream.ui.blocks[id] = gr.HTML( """