from fastrtc import (
    ReplyOnPause,
    AdditionalOutputs,
    Stream,
    aggregate_bytes_to_16bit,
    get_twilio_turn_credentials,
    WebRTCError,
    stt,
    audio_to_bytes,
)
import numpy as np
import gradio as gr
from gradio.utils import get_space
from groq import Groq
from elevenlabs import ElevenLabs
from dotenv import load_dotenv
import time
import os
from fastapi import FastAPI

load_dotenv()
groq_client = Groq()
tts_client = ElevenLabs(api_key=os.getenv("ELEVENLABS_API_KEY"))
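# Both clients read their keys from the environment (populated from a local
# .env file by load_dotenv()): Groq() picks up GROQ_API_KEY automatically,
# and ELEVENLABS_API_KEY is passed explicitly above.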
# See "Talk to Claude" in Cookbook for an example of how to keep | |
# track of the chat history. | |
def response( | |
audio: tuple[int, np.ndarray], | |
chatbot: list[dict] | None = None, | |
): | |
    try:
        chatbot = chatbot or []
        messages = [{"role": d["role"], "content": d["content"]} for d in chatbot]
        start = time.time()
        # Alternative: fastrtc's bundled STT model, e.g. text = stt(audio)
        text = groq_client.audio.transcriptions.create(
            file=("audio-file.mp3", audio_to_bytes(audio)),
            model="whisper-large-v3-turbo",
            response_format="verbose_json",
        ).text
        print("transcription", time.time() - start)
        print("prompt", text)
        chatbot.append({"role": "user", "content": text})
        # Show the user's message in the chatbot UI before the LLM responds.
        yield AdditionalOutputs(chatbot)
        messages.append({"role": "user", "content": text})
        response_text = (
            groq_client.chat.completions.create(
                model="llama-3.1-8b-instant",
                max_tokens=512,
                messages=messages,  # type: ignore
            )
            .choices[0]
            .message.content
        )
        chatbot.append({"role": "assistant", "content": response_text})
        iterator = tts_client.text_to_speech.convert_as_stream(
            text=response_text,  # type: ignore
            voice_id="JBFqnCBsd6RMkjVDRZzb",
            model_id="eleven_multilingual_v2",
            output_format="pcm_24000",  # raw 16-bit PCM at 24 kHz
        )
        # aggregate_bytes_to_16bit re-chunks the byte stream on 16-bit sample
        # boundaries so each chunk decodes cleanly as int16 audio.
        for chunk in aggregate_bytes_to_16bit(iterator):
            audio_array = np.frombuffer(chunk, dtype=np.int16).reshape(1, -1)
            yield (24000, audio_array)
        yield AdditionalOutputs(chatbot)
    except Exception:
        import traceback

        traceback.print_exc()
        # Re-raise as WebRTCError so the failure is reported to the client
        # instead of the stream dying silently.
        raise WebRTCError(traceback.format_exc())


chatbot = gr.Chatbot(type="messages")
stream = Stream(
    modality="audio",
    mode="send-receive",
    handler=ReplyOnPause(response, input_sample_rate=16000),
    additional_outputs_handler=lambda a, b: b,
    additional_inputs=[chatbot],
    additional_outputs=[chatbot],
    # A TURN server (Twilio here) is only needed when deployed on Spaces.
    rtc_configuration=get_twilio_turn_credentials() if get_space() else None,
    concurrency_limit=20 if get_space() else None,
)
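
# Note: fastrtc calls `additional_outputs_handler` with the previous component
# value and the newly yielded one; `lambda a, b: b` simply keeps the latest
# chatbot state sent via AdditionalOutputs.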

# Replace the built-in HTML title block of the auto-generated UI with a
# custom header.
for block_id, block in stream.ui.blocks.items():
    if isinstance(block, gr.HTML):
        stream.ui.blocks[block_id] = gr.HTML(
            """
            <h1 style='text-align: center'>
            LLM Voice Chat (Powered by Groq, ElevenLabs, and WebRTC ⚡️)
            </h1>
            """
        )

# Mount the Stream's auto-generated Gradio UI on a FastAPI app so we don't
# have to build the UI manually.
app = FastAPI()
gr.mount_gradio_app(app, stream.ui, path="/")
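
# The UI is served at the root path; if this file is saved as app.py, the
# same server can equivalently be started from the command line with:
#   uvicorn app:app --host 0.0.0.0 --port 7860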

if __name__ == "__main__":
    if (mode := os.getenv("MODE")) == "UI":
        stream.ui.launch(server_port=7860)
    elif mode == "PHONE":
        # fastphone() serves the stream over a temporary phone number.
        stream.fastphone(host="0.0.0.0", port=7860)
    else:
        import uvicorn

        uvicorn.run(app, host="0.0.0.0", port=7860)
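
# Launch modes, selected via the MODE environment variable:
#   MODE=UI     -> launch only the Gradio UI on port 7860
#   MODE=PHONE  -> serve the stream over a temporary phone number
#   (unset)     -> serve the mounted FastAPI app with uvicorn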