File size: 3,284 Bytes
7985f98
0b8cb49
 
304180d
0b8cb49
304180d
0b8cb49
5cb0b21
 
304180d
 
 
 
 
5cb0b21
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7985f98
0b8cb49
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5cb0b21
0b8cb49
 
 
 
 
 
 
 
 
7985f98
0b8cb49
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
import gradio as gr
from gradio_webrtc import WebRTC, AdditionalOutputs, ReplyOnPause
from pydub import AudioSegment
from io import BytesIO
import numpy as np
import librosa
import tempfile
from twilio.rest import Client
import os
from transformers import Qwen2AudioForConditionalGeneration, AutoProcessor

# Hub checkpoint shared by the processor and the model so the two cannot drift apart.
MODEL_ID = "Qwen/Qwen2-Audio-7B-Instruct"

# Both objects are created once at import time and reused by every transcribe() call.
processor = AutoProcessor.from_pretrained(MODEL_ID)
model = Qwen2AudioForConditionalGeneration.from_pretrained(
    MODEL_ID,
    device_map="auto",
)

# Twilio credentials are optional and are read from the environment.
account_sid = os.getenv("TWILIO_ACCOUNT_SID")
auth_token = os.getenv("TWILIO_AUTH_TOKEN")

# Without both credentials the WebRTC component gets no explicit configuration.
rtc_configuration = None

if account_sid and auth_token:
    # Request a Twilio token and hand its ICE servers to the WebRTC component.
    client = Client(account_sid, auth_token)
    token = client.tokens.create()
    rtc_configuration = {
        "iceServers": token.ice_servers,
        "iceTransportPolicy": "relay",
    }


def _load_conversation_audio(conversation: list[dict], sampling_rate: int) -> list[np.ndarray]:
    """Decode every audio clip referenced by *conversation*, in message order.

    Only messages whose ``content`` is a list are inspected; plain-text
    messages (the assistant replies appended by ``transcribe``) are skipped.
    Each clip is resampled to *sampling_rate* while loading.
    """
    clips = []
    for message in conversation:
        content = message["content"]
        if not isinstance(content, list):
            continue
        for element in content:
            if element["type"] == "audio":
                # Passing the path lets librosa open and close the file itself,
                # so no file handle is left dangling.
                waveform, _ = librosa.load(element["audio_url"], sr=sampling_rate)
                clips.append(waveform)
    return clips


def transcribe(audio: tuple[int, np.ndarray], transformers_convo: list[dict], gradio_convo: list[dict]):
    """Run one conversation turn: send the user's audio to the model and record the reply.

    Args:
        audio: ``(sample_rate, samples)`` pair for the utterance just captured.
        transformers_convo: Chat history in the structure passed to
            ``processor.apply_chat_template``. ``None`` is treated as empty.
        gradio_convo: Chat history in the ``{"role", "content"}`` message
            format shown by the Chatbot. ``None`` is treated as empty.

    Yields:
        A single ``AdditionalOutputs(transformers_convo, gradio_convo)`` holding
        both histories, each extended with the user turn and the model reply.
    """
    # A State/Chatbot without an initial value hands us None on the first turn.
    if transformers_convo is None:
        transformers_convo = []
    if gradio_convo is None:
        gradio_convo = []

    sample_rate, samples = audio
    segment = AudioSegment(
        samples.tobytes(),
        frame_rate=sample_rate,
        sample_width=samples.dtype.itemsize,
        channels=1,
    )

    # The clip must outlive this call: every later turn re-reads all earlier
    # audio files, and the chat UI references the file as well. Reserve a
    # unique path with delete=False and close the handle before exporting so
    # the file can be written by name on every platform.
    # NOTE: these files are never removed here; cleanup is left to the OS temp dir.
    with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as temp_audio:
        audio_path = temp_audio.name
    segment.export(audio_path, format="mp3")

    transformers_convo.append({"role": "user", "content": [{"type": "audio", "audio_url": audio_path}]})
    # The recorded clip is the user's turn, so show it on the user side of the chat.
    gradio_convo.append({"role": "user", "content": gr.Audio(value=audio_path)})

    text = processor.apply_chat_template(transformers_convo, add_generation_prompt=True, tokenize=False)
    audios = _load_conversation_audio(transformers_convo, processor.feature_extractor.sampling_rate)
    inputs = processor(text=text, audios=audios, return_tensors="pt", padding=True)
    # Move every tensor to wherever the model lives instead of hard-coding "cuda";
    # assigning to `inputs.input_ids` alone would not change what `**inputs` passes.
    inputs = inputs.to(model.device)

    # max_new_tokens bounds only the reply; max_length would also count the
    # (growing) prompt and could leave no room for an answer.
    generate_ids = model.generate(**inputs, max_new_tokens=256)
    # Drop the prompt tokens so only the newly generated reply is decoded.
    generate_ids = generate_ids[:, inputs.input_ids.size(1):]
    response = processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
    print("response", response)
    transformers_convo.append({"role": "assistant", "content": response})
    gradio_convo.append({"role": "assistant", "content": response})

    yield AdditionalOutputs(transformers_convo, gradio_convo)


with gr.Blocks() as demo:
    # Per-session chat history in the transformers chat-template format.
    # It starts as an empty list because transcribe() appends to it.
    transformers_convo = gr.State(value=[])
    with gr.Row():
        with gr.Column():
            audio = WebRTC(
                rtc_configuration=rtc_configuration,
                label="Stream",
                mode="send",
                modality="audio",
            )
        with gr.Column():
            transcript = gr.Chatbot(label="transcript", type="messages")

    # ReplyOnPause wraps transcribe as the handler for the audio stream.
    audio.stream(ReplyOnPause(transcribe), inputs=[audio, transformers_convo, transcript], outputs=[audio])
    # transcribe() yields AdditionalOutputs(transformers_convo, gradio_convo):
    # accept both values and return one per output component.
    audio.on_additional_outputs(lambda s, a: (s, a), outputs=[transformers_convo, transcript])

if __name__ == "__main__":
    demo.launch()