import gradio as gr
from gradio_webrtc import WebRTC, AdditionalOutputs, ReplyOnPause
from pydub import AudioSegment
from io import BytesIO
import numpy as np
import librosa
from twilio.rest import Client
import os
import spaces
import uuid
from transformers import Qwen2AudioForConditionalGeneration, AutoProcessor
import logging
# Set the root logger to WARNING to suppress debug messages from other libraries
logging.basicConfig(level=logging.WARNING)

# Console handler that surfaces DEBUG-level records
console_handler = logging.StreamHandler()
console_handler.setLevel(logging.DEBUG)
formatter = logging.Formatter("%(name)s - %(levelname)s - %(message)s")
console_handler.setFormatter(formatter)

# Enable DEBUG logging for the gradio_webrtc library only
logger = logging.getLogger("gradio_webrtc")
logger.setLevel(logging.DEBUG)
logger.addHandler(console_handler)
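
# Load Qwen2-Audio and its processor; device_map="auto" places the weights on the available GPU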
processor = AutoProcessor.from_pretrained("Qwen/Qwen2-Audio-7B-Instruct")
model = Qwen2AudioForConditionalGeneration.from_pretrained("Qwen/Qwen2-Audio-7B-Instruct", device_map="auto")
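
# WebRTC from a hosted Space usually needs a TURN relay. If Twilio credentials
# are present, fetch ephemeral ICE servers; otherwise fall back to the default
# configuration (fine for local development).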
account_sid = os.environ.get("TWILIO_ACCOUNT_SID")
auth_token = os.environ.get("TWILIO_AUTH_TOKEN")
if account_sid and auth_token:
    client = Client(account_sid, auth_token)
    token = client.tokens.create()
    rtc_configuration = {
        "iceServers": token.ice_servers,
        "iceTransportPolicy": "relay",
    }
else:
    rtc_configuration = None
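
# ReplyOnPause invokes this handler whenever the speaker pauses; yielding
# AdditionalOutputs hands the captured audio to the on_additional_outputs
# listener instead of streaming anything back to the client.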
def yield_audio(audio: tuple[int, np.ndarray]):
    yield AdditionalOutputs(audio)
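
# @spaces.GPU allocates a ZeroGPU device for the duration of each call when running on Spaces.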
@spaces.GPU
def respond(transformers_convo: list[dict], gradio_convo: list[dict], audio: tuple[int, np.ndarray]):
    # Convert the raw (sample_rate, samples) tuple into an mp3 on disk
    segment = AudioSegment(audio[1].tobytes(), frame_rate=audio[0], sample_width=audio[1].dtype.itemsize, channels=1)
    name = str(uuid.uuid4()) + ".mp3"
    segment.export(name, format="mp3")
    # Record the clip as the user's turn in both conversation copies
    transformers_convo.append({"role": "user", "content": [{"type": "audio", "audio_url": name}]})
gradio_convo.append({"role": "assistant", "content": gr.Audio(value=name)})
    # Render the chat template and gather every audio clip referenced in the conversation
    text = processor.apply_chat_template(transformers_convo, add_generation_prompt=True, tokenize=False)
    audios = []
    for message in transformers_convo:
        if isinstance(message["content"], list):
            for ele in message["content"]:
                if ele["type"] == "audio":
                    audios.append(
                        librosa.load(
                            BytesIO(open(ele["audio_url"], "rb").read()),
                            sr=processor.feature_extractor.sampling_rate,
                        )[0]
                    )
    inputs = processor(text=text, audios=audios, return_tensors="pt", padding=True)
    inputs = dict(**inputs)
    inputs["input_ids"] = inputs["input_ids"].to("cuda:0")
    # Generate, then strip the prompt tokens from the output before decoding
    generate_ids = model.generate(**inputs, max_length=256)
    generate_ids = generate_ids[:, inputs["input_ids"].size(1):]
    response = processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
    # Append the model's reply to both conversation copies and update the UI
    transformers_convo.append({"role": "assistant", "content": response})
    gradio_convo.append({"role": "assistant", "content": response})
    yield transformers_convo, gradio_convo
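
# UI: WebRTC microphone stream on the left, live transcript on the right.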
with gr.Blocks() as demo:
    transformers_convo = gr.State(value=[])
    with gr.Row():
        with gr.Column():
            audio = WebRTC(
                rtc_configuration=rtc_configuration,
                label="Stream",
                mode="send",
                modality="audio",
            )
        with gr.Column():
            transcript = gr.Chatbot(label="transcript", type="messages")
    audio.stream(ReplyOnPause(yield_audio), inputs=[audio], outputs=[audio])
    # respond expects the current conversation state ahead of the yielded audio, so pass it as inputs
    audio.on_additional_outputs(respond, inputs=[transformers_convo, transcript], outputs=[transformers_convo, transcript])
if __name__ == "__main__":
    demo.launch()