freddyaboulton (HF staff) committed
Commit 0b8cb49 · 1 Parent(s): 304180d
Files changed (1): app.py (+55 -7)

app.py CHANGED
@@ -1,17 +1,65 @@
  import gradio as gr
- from gradio_webrtc import WebRTC, AdditionalOutputs
-
+ from gradio_webrtc import WebRTC, AdditionalOutputs, ReplyOnPause
+ from pydub import AudioSegment
  from io import BytesIO
- from urllib.request import urlopen
+ import numpy as np
  import librosa
+ import tempfile
  from transformers import Qwen2AudioForConditionalGeneration, AutoProcessor

  processor = AutoProcessor.from_pretrained("Qwen/Qwen2-Audio-7B-Instruct")
  model = Qwen2AudioForConditionalGeneration.from_pretrained("Qwen/Qwen2-Audio-7B-Instruct", device_map="auto")


- def greet(name):
-     return "Hello " + name + "!!"
-
- demo = gr.Interface(fn=greet, inputs="text", outputs="text")
- demo.launch()
+ def transcribe(audio: tuple[int, np.ndarray], transformers_convo: list[dict], gradio_convo: list[dict]):
+     # Repackage the raw (sample_rate, int16 samples) tuple from the WebRTC stream.
+     segment = AudioSegment(
+         audio[1].tobytes(),
+         frame_rate=audio[0],
+         sample_width=audio[1].dtype.itemsize,
+         channels=1,
+     )
+
+     # delete=False keeps the file on disk: every call re-reads earlier turns
+     # from their audio_url paths, so the files must outlive this block.
+     with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as temp_audio:
+         segment.export(temp_audio.name, format="mp3")
+
+     transformers_convo.append({"role": "user", "content": [{"type": "audio", "audio_url": temp_audio.name}]})
+     gradio_convo.append({"role": "user", "content": gr.Audio(value=temp_audio.name)})
+     text = processor.apply_chat_template(transformers_convo, add_generation_prompt=True, tokenize=False)
+     audios = []
+     for message in transformers_convo:
+         if isinstance(message["content"], list):
+             for ele in message["content"]:
+                 if ele["type"] == "audio":
+                     audios.append(librosa.load(
+                         BytesIO(open(ele["audio_url"], "rb").read()),
+                         sr=processor.feature_extractor.sampling_rate)[0]
+                     )
+     inputs = processor(text=text, audios=audios, return_tensors="pt", padding=True)
+     inputs.input_ids = inputs.input_ids.to("cuda")
+
+     generate_ids = model.generate(**inputs, max_length=256)
+     # Strip the prompt tokens so only the newly generated reply is decoded.
+     generate_ids = generate_ids[:, inputs.input_ids.size(1):]
+     response = processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
+     print("response", response)
+     transformers_convo.append({"role": "assistant", "content": response})
+     gradio_convo.append({"role": "assistant", "content": response})
+
+     yield AdditionalOutputs(transformers_convo, gradio_convo)
+
+
+ with gr.Blocks() as demo:
+     # Start each session with an empty conversation; a bare gr.State() would
+     # hand transcribe None, which it cannot append to.
+     transformers_convo = gr.State(value=[])
+     with gr.Row():
+         with gr.Column():
+             audio = WebRTC(
+                 label="Stream",
+                 mode="send",
+                 modality="audio",
+             )
+         with gr.Column():
+             transcript = gr.Chatbot(label="transcript", type="messages")
+
+     audio.stream(ReplyOnPause(transcribe), inputs=[audio, transformers_convo, transcript], outputs=[audio])
+     # AdditionalOutputs carries two values, so the callback takes two arguments.
+     audio.on_additional_outputs(lambda s, a: (s, a), outputs=[transformers_convo, transcript])
+
+ if __name__ == "__main__":
+     demo.launch()
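
For quick sanity-checking outside the app, here is a minimal sketch (not part of the commit) of the PCM-to-MP3 hop that transcribe performs. The 16 kHz rate, the 440 Hz test tone, and the turn.mp3 filename are assumptions, and pydub needs ffmpeg on the PATH:

import numpy as np
from pydub import AudioSegment

# Hypothetical stand-in for the (sample_rate, samples) tuple the WebRTC
# component emits: one second of a 440 Hz tone as 16 kHz mono int16 PCM.
sample_rate = 16000
t = np.arange(sample_rate) / sample_rate
samples = (np.sin(2 * np.pi * 440 * t) * 32767).astype(np.int16)

segment = AudioSegment(
    samples.tobytes(),
    frame_rate=sample_rate,
    sample_width=samples.dtype.itemsize,  # 2 bytes per int16 sample
    channels=1,
)
segment.export("turn.mp3", format="mp3")  # pydub shells out to ffmpeg here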
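
And a sketch of driving transcribe directly as a generator, skipping the WebRTC UI entirely; it reuses sample_rate and samples from the sketch above and still assumes the Qwen2-Audio weights and a CUDA device are available:

# Hypothetical direct invocation of the commit's transcribe function.
transformers_convo: list[dict] = []
gradio_convo: list[dict] = []

for update in transcribe((sample_rate, samples), transformers_convo, gradio_convo):
    # Each yielded AdditionalOutputs wraps the refreshed conversation lists.
    pass

print(transformers_convo[-1]["content"])  # the model's text reply to the tone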