import gradio as gr
from gradio_webrtc import WebRTC, ReplyOnPause, AdditionalOutputs
import transformers
import numpy as np
from twilio.rest import Client
import os
import torch
import librosa
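# Assumed pip dependencies for the imports above: gradio, gradio-webrtc,
# transformers, torch, numpy, librosa, twilio.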


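# Ultravox generates the assistant's reply directly from audio plus the
# conversation so far; Whisper is used only to transcribe the user's speech
# for display in the chat transcript.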
# Fall back to CPU when CUDA is unavailable so the script still starts
# (generation with an 8B model will be very slow off-GPU).
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

pipe = transformers.pipeline(model="fixie-ai/ultravox-v0_4_1-llama-3_1-8b",
                             trust_remote_code=True, device=device)
whisper = transformers.pipeline(model="openai/whisper-large-v3-turbo", device=device)

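# Twilio credentials are optional. When present, a TURN relay is configured so
# the WebRTC connection also works behind restrictive NATs; set them via the
# TWILIO_ACCOUNT_SID and TWILIO_AUTH_TOKEN environment variables.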
account_sid = os.environ.get("TWILIO_ACCOUNT_SID")
auth_token = os.environ.get("TWILIO_AUTH_TOKEN")

if account_sid and auth_token:
    client = Client(account_sid, auth_token)

    token = client.tokens.create()

    rtc_configuration = {
        "iceServers": token.ice_servers,
        "iceTransportPolicy": "relay",
    }
else:
    rtc_configuration = None


def transcribe(audio: tuple[int, np.ndarray], conversation: list[dict], gradio_convo: list[dict]):
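    """Run one user turn: resample the audio, generate a reply with Ultravox,
    transcribe the user's speech with Whisper, and append both to the history."""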
    original_sr = audio[0]
    target_sr = 16000

    # Normalize int16 PCM to float32 in [-1, 1], then resample to the 16 kHz
    # rate both models expect.
    audio_sr = librosa.resample(audio[1].astype(np.float32) / 32768.0,
                                orig_sr=original_sr, target_sr=target_sr)

    # Ultravox takes the raw audio plus the prior turns and returns the reply text.
    output = pipe({"audio": audio_sr, "turns": conversation, "sampling_rate": target_sr},
                  max_new_tokens=512)
    transcription = whisper({"array": audio_sr, "sampling_rate": target_sr})

    # The Whisper pipeline returns a dict; store only the transcribed text.
    conversation.append({"role": "user", "content": transcription["text"]})
    conversation.append({"role": "assistant", "content": output})
    gradio_convo.append({"role": "user", "content": transcription["text"]})
    gradio_convo.append({"role": "assistant", "content": output})

    yield AdditionalOutputs(conversation, gradio_convo)


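# Build the UI: a WebRTC audio stream on the left, a chat transcript on the right.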
with gr.Blocks() as demo:
    gr.HTML(
    """
    <h1 style='text-align: center'>
    Talk to Ultravox Llama 3.1 8B (Powered by WebRTC ⚡️)
    </h1>
    <p style='text-align: center'>
    Once you grant access to your microphone, you can talk naturally to Ultravox.
    When you stop talking, the audio will be sent for processing.
    </p>
    <p style='text-align: center'>
    Each conversation is limited to 90 seconds. Once the time limit is up, you can rejoin the conversation.
    </p>
    """
    )
    transformers_convo = gr.State(value=[{
        "role": "system",
        "content": "You are a friendly and helpful character. You love to answer questions for people."
    }])
    with gr.Row():
        with gr.Column():
            audio = WebRTC(
                rtc_configuration=rtc_configuration,
                label="Stream",
                mode="send",
                modality="audio",
            )
        with gr.Column():
            transcript = gr.Chatbot(label="Transcript", type="messages")

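    # ReplyOnPause invokes transcribe() each time the speaker pauses; the
    # AdditionalOutputs it yields are routed to the state and chatbot below.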
    audio.stream(ReplyOnPause(transcribe),
                 inputs=[audio, transformers_convo, transcript],
                 outputs=[audio], time_limit=90)
    audio.on_additional_outputs(lambda s, a: (s, a),
                                outputs=[transformers_convo, transcript],
                                queue=False, show_progress="hidden")

if __name__ == "__main__":
    demo.launch()