import gradio as gr
from gradio_webrtc import WebRTC, ReplyOnPause, AdditionalOutputs
import transformers
import numpy as np
from twilio.rest import Client
import os
import torch
import librosa
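
# Two pipelines: Ultravox (a speech-capable Llama 3.1 8B) generates the assistant's
# reply directly from the audio, while Whisper transcribes the user's speech so it
# can be shown in the chat transcript.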
pipe = transformers.pipeline(model="fixie-ai/ultravox-v0_4_1-llama-3_1-8b", trust_remote_code=True,
                             device=torch.device("cuda"))
whisper = transformers.pipeline(model="openai/whisper-large-v3-turbo",
                                device=torch.device("cuda"))
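
# When Twilio credentials are set, fetch ephemeral ICE (TURN) servers so the WebRTC
# connection can be relayed; without them, leave rtc_configuration unset, which may be
# enough when running locally.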
account_sid = os.environ.get("TWILIO_ACCOUNT_SID")
auth_token = os.environ.get("TWILIO_AUTH_TOKEN")

if account_sid and auth_token:
    client = Client(account_sid, auth_token)
    token = client.tokens.create()
    rtc_configuration = {
        "iceServers": token.ice_servers,
        "iceTransportPolicy": "relay",
    }
else:
    rtc_configuration = None
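

# Invoked via ReplyOnPause each time the user stops talking. `audio` is a
# (sample_rate, numpy array) tuple holding the latest stretch of speech.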
def transcribe(audio: tuple[int, np.ndarray], conversation: list[dict], gradio_convo: list[dict]):
    original_sr = audio[0]
    target_sr = 16000

    # Convert int16 PCM to float32 in [-1, 1) and resample to the 16 kHz rate both models expect.
    audio_sr = librosa.resample(audio[1].astype(np.float32) / 32768.0,
                                orig_sr=original_sr, target_sr=target_sr)

    # Ultravox answers directly from the audio, conditioned on the running conversation.
    output = pipe({"audio": audio_sr, "turns": conversation, "sampling_rate": target_sr},
                  max_new_tokens=512)
    # Whisper returns a dict; keep only the transcribed text.
    transcription = whisper({"array": audio_sr.squeeze(), "sampling_rate": target_sr})["text"]

    conversation.append({"role": "user", "content": transcription})
    conversation.append({"role": "assistant", "content": output})
    gradio_convo.append({"role": "user", "content": transcription})
    gradio_convo.append({"role": "assistant", "content": output})

    yield AdditionalOutputs(conversation, gradio_convo)
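

# UI: a WebRTC microphone stream side by side with a Chatbot showing the running transcript.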
with gr.Blocks() as demo:
    gr.HTML(
        """
        <h1 style='text-align: center'>
        Talk to Ultravox Llama 3.1 8b (Powered by WebRTC ⚡️)
        </h1>
        <p style='text-align: center'>
        Once you grant access to your microphone, you can talk naturally to Ultravox.
        When you stop talking, the audio will be sent for processing.
        </p>
        <p style='text-align: center'>
        Each conversation is limited to 90 seconds. Once the time limit is up you can rejoin the conversation.
        </p>
        """
    )
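    # Conversation history in the turn format passed to Ultravox, seeded with a system prompt.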
    transformers_convo = gr.State(value=[{
        "role": "system",
        "content": "You are a friendly and helpful character. You love to answer questions for people."
    }])
    with gr.Row():
        with gr.Column():
            audio = WebRTC(
                rtc_configuration=rtc_configuration,
                label="Stream",
                mode="send",
                modality="audio",
            )
        with gr.Column():
            transcript = gr.Chatbot(label="transcript", type="messages")
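
    # Stream handler: ReplyOnPause runs `transcribe` whenever the user pauses; the
    # AdditionalOutputs it yields are routed by `on_additional_outputs` to update the
    # conversation state and the transcript. time_limit=90 enforces the 90-second cap
    # mentioned in the header text.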
    audio.stream(ReplyOnPause(transcribe), inputs=[audio, transformers_convo, transcript],
                 outputs=[audio], time_limit=90)
    audio.on_additional_outputs(lambda s, a: (s, a), outputs=[transformers_convo, transcript],
                                queue=False, show_progress="hidden")

if __name__ == "__main__":
    demo.launch()