Spaces:
Running
Running
File size: 3,060 Bytes
e6da608 4a472df e6da608 4a472df e6da608 bc0c6f3 4a472df e3f84d5 4a472df bc0c6f3 4a472df bc0c6f3 4a472df bc0c6f3 4a472df bc0c6f3 4a472df bc0c6f3 4a472df e6da608 4a472df b1ea566 bc0c6f3 4a472df 7290ef1 4a472df d0794da 4a472df 7290ef1 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 |
import os
import time
import gradio as gr
import numpy as np
from dotenv import load_dotenv
from elevenlabs import ElevenLabs
from fastapi import FastAPI
from fastrtc import (
AdditionalOutputs,
ReplyOnPause,
Stream,
WebRTCError,
get_stt_model,
get_twilio_turn_credentials,
)
from gradio.utils import get_space
from groq import Groq
from numpy.typing import NDArray
load_dotenv()
groq_client = Groq()
tts_client = ElevenLabs(api_key=os.getenv("ELEVENLABS_API_KEY"))
stt_model = get_stt_model()
# See "Talk to Claude" in Cookbook for an example of how to keep
# track of the chat history.
def response(
audio: tuple[int, NDArray[np.int16 | np.float32]],
chatbot: list[dict] | None = None,
):
try:
chatbot = chatbot or []
messages = [{"role": d["role"], "content": d["content"]} for d in chatbot]
start = time.time()
text = stt_model.stt(audio)
print("transcription", time.time() - start)
print("prompt", text)
chatbot.append({"role": "user", "content": text})
yield AdditionalOutputs(chatbot)
messages.append({"role": "user", "content": text})
response_text = (
groq_client.chat.completions.create(
model="llama-3.1-8b-instant",
max_tokens=512,
messages=messages, # type: ignore
)
.choices[0]
.message.content
)
chatbot.append({"role": "assistant", "content": response_text})
for chunk in tts_client.text_to_speech.convert_as_stream(
text=response_text, # type: ignore
voice_id="JBFqnCBsd6RMkjVDRZzb",
model_id="eleven_multilingual_v2",
output_format="pcm_24000",
):
audio_array = np.frombuffer(chunk, dtype=np.int16).reshape(1, -1)
yield (24000, audio_array)
yield AdditionalOutputs(chatbot)
except Exception:
import traceback
traceback.print_exc()
raise WebRTCError(traceback.format_exc())
chatbot = gr.Chatbot(type="messages")
stream = Stream(
modality="audio",
mode="send-receive",
handler=ReplyOnPause(response, input_sample_rate=16000),
additional_outputs_handler=lambda a, b: b,
additional_inputs=[chatbot],
additional_outputs=[chatbot],
rtc_configuration=get_twilio_turn_credentials() if get_space() else None,
concurrency_limit=5 if get_space() else None,
time_limit=90 if get_space() else None,
ui_args={"title": "LLM Voice Chat (Powered by Groq, ElevenLabs, and WebRTC ⚡️)"},
)
# Mount the STREAM UI to the FastAPI app
# Because I don't want to build the UI manually
app = FastAPI()
app = gr.mount_gradio_app(app, stream.ui, path="/")
if __name__ == "__main__":
import os
os.environ["GRADIO_SSR_MODE"] = "false"
if (mode := os.getenv("MODE")) == "UI":
stream.ui.launch(server_port=7860)
elif mode == "PHONE":
stream.fastphone(host="0.0.0.0", port=7860)
else:
stream.ui.launch(server_port=7860)
|