Gyaneshere committed on
Commit 7d5976d · verified · 1 Parent(s): da34deb

Create app.py

Files changed (1)
  1. app.py +306 -0
app.py ADDED
@@ -0,0 +1,306 @@
import gradio as gr
import torch
from transformers import Qwen2_5OmniModel, Qwen2_5OmniProcessor
from qwen_omni_utils import process_mm_info
import soundfile as sf
import tempfile
import spaces

# Initialize the model and processor
device = "cuda" if torch.cuda.is_available() else "cpu"
# bfloat16 on GPU; fall back to float32 on CPU (half-precision matmuls are not supported on CPU)
torch_dtype = torch.bfloat16 if torch.cuda.is_available() else torch.float32

model = Qwen2_5OmniModel.from_pretrained(
    "Qwen/Qwen2.5-Omni-7B",
    torch_dtype=torch_dtype,
    device_map="auto",
    enable_audio_output=True,
    # attn_implementation="flash_attention_2" if torch.cuda.is_available() else None
)
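# enable_audio_output=True keeps the talker (speech-generation) weights loaded so that
# generate() can return audio alongside text, at the cost of extra GPU memory.
# Note: the upstream transformers integration exposes this checkpoint as
# Qwen2_5OmniForConditionalGeneration; the Qwen2_5OmniModel name above matches the earlier
# preview API, so keep the import in line with the installed transformers version.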

processor = Qwen2_5OmniProcessor.from_pretrained("Qwen/Qwen2.5-Omni-7B")

# System prompt
SYSTEM_PROMPT = {
    "role": "system",
    "content": "You are Qwen, a virtual human developed by the Qwen Team, Alibaba Group, capable of perceiving auditory and visual inputs, as well as generating text and speech."
}
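# The Qwen2.5-Omni usage notes recommend this exact system prompt when audio output is
# enabled; with a different system prompt, speech generation may not work as expected.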

# Voice options
VOICE_OPTIONS = {
    "Chelsie (Female)": "Chelsie",
    "Ethan (Male)": "Ethan"
}

@spaces.GPU
def process_input(image, audio, video, text, chat_history, voice_type, enable_audio_output):
    # Combine multimodal inputs
    user_input = {
        "text": text,
        "image": image if image is not None else None,
        "audio": audio if audio is not None else None,
        "video": video if video is not None else None
    }

    # Prepare conversation history for model processing
    conversation = [SYSTEM_PROMPT]

    # Add previous chat history (skip pending rows that have no assistant reply yet;
    # Gradio may hand the history back as lists rather than tuples)
    if isinstance(chat_history, list):
        for item in chat_history:
            if isinstance(item, (tuple, list)) and len(item) == 2 and item[1] is not None:
                user_msg, bot_msg = item
                conversation.append({"role": "user", "content": user_input_to_content(user_msg)})
                conversation.append({"role": "assistant", "content": bot_msg})
    else:
        # Initialize chat history if it's not a list
        chat_history = []

    # Add current user input
    conversation.append({"role": "user", "content": user_input_to_content(user_input)})

    # Prepare for inference (keep `text` untouched so it can still be shown in the chat history)
    prompt = processor.apply_chat_template(conversation, add_generation_prompt=True, tokenize=False)
    audios, images, videos = process_mm_info(conversation, use_audio_in_video=True)

    inputs = processor(
        text=prompt,
        audios=audios,
        images=images,
        videos=videos,
        return_tensors="pt",
        padding=True
    )
    inputs = inputs.to(model.device).to(model.dtype)

    # Generate response
    if enable_audio_output:
        voice_type_value = VOICE_OPTIONS.get(voice_type, "Chelsie")
        # Use a separate name for the generated waveform so the uploaded `audio` argument
        # is not shadowed (it is checked again below when formatting the display message)
        text_ids, audio_waveform = model.generate(
            **inputs,
            use_audio_in_video=True,
            return_audio=True,
            spk=voice_type_value
        )

        # Save audio to a temporary WAV file (the talker produces 24 kHz speech)
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
            sf.write(
                tmp_file.name,
                audio_waveform.reshape(-1).detach().cpu().numpy(),
                samplerate=24000,
            )
            audio_path = tmp_file.name
    else:
        text_ids = model.generate(
            **inputs,
            use_audio_in_video=True,
            return_audio=False
        )
        audio_path = None

    # Decode text response
    text_response = processor.batch_decode(
        text_ids,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False
    )[0]

    # Clean up text response
    text_response = text_response.strip()

    # Format user message for chat history display
    user_message_for_display = str(text) if text is not None else ""
    if image is not None:
        user_message_for_display = (user_message_for_display or "Image uploaded") + " [Image]"
    if audio is not None:
        user_message_for_display = (user_message_for_display or "Audio uploaded") + " [Audio]"
    if video is not None:
        user_message_for_display = (user_message_for_display or "Video uploaded") + " [Video]"

    # If empty, provide a default message
    if not user_message_for_display.strip():
        user_message_for_display = "Multimodal input"

    # Update chat history with properly formatted entries, replacing any pending
    # placeholder row that was added when the user message was first echoed
    if not isinstance(chat_history, list):
        chat_history = []
    if chat_history and isinstance(chat_history[-1], (tuple, list)) and chat_history[-1][1] is None:
        chat_history = chat_history[:-1]
    chat_history.append((user_message_for_display, text_response))

    # Prepare output
    if enable_audio_output and audio_path:
        return chat_history, text_response, audio_path
    else:
        return chat_history, text_response, None

def user_input_to_content(user_input):
    if isinstance(user_input, str):
        return user_input
    elif isinstance(user_input, dict):
        # Handle file uploads
        content = []
        if "text" in user_input and user_input["text"]:
            content.append({"type": "text", "text": user_input["text"]})
        if "image" in user_input and user_input["image"]:
            content.append({"type": "image", "image": user_input["image"]})
        if "audio" in user_input and user_input["audio"]:
            content.append({"type": "audio", "audio": user_input["audio"]})
        if "video" in user_input and user_input["video"]:
            content.append({"type": "video", "video": user_input["video"]})
        return content
    return user_input

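# For reference, a dict with text and an image is converted into a content list like
#   [{"type": "text", "text": "Describe this picture"},
#    {"type": "image", "image": "/path/to/upload.png"}]   # illustrative values only
# which is the structure process_mm_info walks when collecting media for the processor.
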
def create_demo():
    with gr.Blocks(title="Qwen2.5-Omni Chat Demo", theme=gr.themes.Soft()) as demo:
        gr.Markdown("# Qwen2.5-Omni Multimodal Chat Demo")
        gr.Markdown("Experience the omni-modal capabilities of Qwen2.5-Omni through text, images, audio, and video interactions.")

        # Hidden placeholder components for text-only input
        placeholder_image = gr.Image(type="filepath", visible=False)
        placeholder_audio = gr.Audio(type="filepath", visible=False)
        placeholder_video = gr.Video(visible=False)

        # Chat interface
        with gr.Row():
            with gr.Column(scale=3):
                chatbot = gr.Chatbot(height=600)
                with gr.Accordion("Advanced Options", open=False):
                    voice_type = gr.Dropdown(
                        choices=list(VOICE_OPTIONS.keys()),
                        value="Chelsie (Female)",
                        label="Voice Type"
                    )
                    enable_audio_output = gr.Checkbox(
                        value=True,
                        label="Enable Audio Output"
                    )

                # Multimodal input components
                with gr.Tabs():
                    with gr.TabItem("Text Input"):
                        text_input = gr.Textbox(
                            placeholder="Type your message here...",
                            label="Text Input"
                        )
                        text_submit = gr.Button("Send Text")

                    with gr.TabItem("Multimodal Input"):
                        with gr.Row():
                            image_input = gr.Image(
                                type="filepath",
                                label="Upload Image"
                            )
                            audio_input = gr.Audio(
                                type="filepath",
                                label="Upload Audio"
                            )
                        with gr.Row():
                            video_input = gr.Video(
                                label="Upload Video"
                            )
                            additional_text = gr.Textbox(
                                placeholder="Additional text message...",
                                label="Additional Text"
                            )
                        multimodal_submit = gr.Button("Send Multimodal Input")

                clear_button = gr.Button("Clear Chat")

            with gr.Column(scale=1):
                gr.Markdown("## Model Capabilities")
                gr.Markdown("""
                **Qwen2.5-Omni can:**
                - Process and understand text
                - Analyze images and answer questions about them
                - Transcribe and understand audio
                - Analyze video content (with or without audio)
                - Generate natural speech responses
                """)

                gr.Markdown("### Example Prompts")
                gr.Examples(
                    examples=[
                        ["Describe what you see in this image", "image"],
                        ["What is being said in this audio clip?", "audio"],
                        ["What's happening in this video?", "video"],
                        ["Explain Artificial Intelligence in simple terms", "text"],
                        ["Generate a short story about a robot learning to play AlphaGo", "text"]
                    ],
                    inputs=[text_input, gr.Textbox(visible=False)],
                    label="Text Examples"
                )

        audio_output = gr.Audio(
            label="Model Speech Output",
            visible=True,
            autoplay=True
        )
        text_output = gr.Textbox(
            label="Model Text Response",
            interactive=False
        )

        # Text input handling: first echo the user message as a pending chat row,
        # then run the model (process_input replaces the pending row with the full exchange)
        text_submit.click(
            fn=lambda text, history: (history or []) + [(str(text) if text is not None else "", None)],
            inputs=[text_input, chatbot],
            outputs=[chatbot],
            queue=False
        ).then(
            fn=process_input,
            inputs=[placeholder_image, placeholder_audio, placeholder_video, text_input, chatbot, voice_type, enable_audio_output],
            outputs=[chatbot, text_output, audio_output]
        )

        # Multimodal input handling
        def prepare_multimodal_input(image, audio, video, text, chat_history):
            # Create a display message that indicates what was uploaded
            display_message = str(text) if text is not None else ""
            if image is not None:
                display_message = (display_message + " " if display_message.strip() else "") + "[Image]"
            if audio is not None:
                display_message = (display_message + " " if display_message.strip() else "") + "[Audio]"
            if video is not None:
                display_message = (display_message + " " if display_message.strip() else "") + "[Video]"

            if not display_message.strip():
                display_message = "Multimodal content"

            # Echo the upload as a pending chat row; process_input fills in the reply.
            # (Returning a bare string here would not be a valid Chatbot value.)
            return (chat_history if isinstance(chat_history, list) else []) + [(display_message, None)]

        multimodal_submit.click(
            fn=prepare_multimodal_input,
            inputs=[image_input, audio_input, video_input, additional_text, chatbot],
            outputs=[chatbot],
            queue=False
        ).then(
            fn=process_input,
            inputs=[image_input, audio_input, video_input, additional_text,
                    chatbot, voice_type, enable_audio_output],
            outputs=[chatbot, text_output, audio_output]
        )

        # Clear chat
        def clear_chat():
            return [], None, None

        clear_button.click(
            fn=clear_chat,
            outputs=[chatbot, text_output, audio_output]
        )

        # Update audio output visibility
        def toggle_audio_output(enable_audio):
            return gr.Audio(visible=enable_audio)

        enable_audio_output.change(
            fn=toggle_audio_output,
            inputs=enable_audio_output,
            outputs=audio_output
        )

    return demo

if __name__ == "__main__":
    demo = create_demo()
    demo.launch(server_name="0.0.0.0", server_port=7860)
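
Note: to run this file outside the Space, the imports above imply roughly these pip dependencies (versions not pinned here; the spaces package only matters on ZeroGPU hardware, and accelerate is assumed because of device_map="auto"): gradio, torch, transformers, accelerate, soundfile, qwen-omni-utils, spaces. With those installed, python app.py serves the demo on port 7860.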