Spaces:

Gyaneshere
/

Qwen2.5-Omni-7B-MultimodalInput-to-Speech

Paused

App Files Files Community

Gyaneshere committed on Mar 28

Commit

7d5976d

verified ·

1 Parent(s): da34deb

Create app.py

Browse files

Files changed (1) hide show

app.py +306 -0

app.py ADDED Viewed

	@@ -0,0 +1,306 @@

+import gradio as gr
+import torch
+from transformers import Qwen2_5OmniModel, Qwen2_5OmniProcessor
+from qwen_omni_utils import process_mm_info
+import soundfile as sf
+import tempfile
+import spaces
+# Initialize the model and processor
+device = "cuda" if torch.cuda.is_available() else "cpu"
+torch_dtype = torch.bfloat16 if torch.cuda.is_available() else torch.float16
+model = Qwen2_5OmniModel.from_pretrained(
+    "Qwen/Qwen2.5-Omni-7B",
+    torch_dtype=torch_dtype,
+    device_map="auto",
+    enable_audio_output=True,
+    # attn_implementation="flash_attention_2" if torch.cuda.is_available() else None
+)
+processor = Qwen2_5OmniProcessor.from_pretrained("Qwen/Qwen2.5-Omni-7B")
+# System prompt
+SYSTEM_PROMPT = {
+    "role": "system",
+    "content": "You are Qwen, a virtual human developed by the Qwen Team, Alibaba Group, capable of perceiving auditory and visual inputs, as well as generating text and speech."
+}
+# Voice options
+VOICE_OPTIONS = {
+    "Chelsie (Female)": "Chelsie",
+    "Ethan (Male)": "Ethan"
+}
+@spaces.GPU
+def process_input(image, audio, video, text, chat_history, voice_type, enable_audio_output):
+    # Combine multimodal inputs
+    user_input = {
+        "text": text,
+        "image": image if image is not None else None,
+        "audio": audio if audio is not None else None,
+        "video": video if video is not None else None
+    }
+    # Prepare conversation history for model processing
+    conversation = [SYSTEM_PROMPT]
+    # Add previous chat history
+    if isinstance(chat_history, list):
+        for item in chat_history:
+            if isinstance(item, tuple) and len(item) == 2:
+                user_msg, bot_msg = item
+                conversation.append({"role": "user", "content": user_input_to_content(user_msg)})
+                conversation.append({"role": "assistant", "content": bot_msg})
+    else:
+        # Initialize chat history if it's not a list
+        chat_history = []
+    # Add current user input
+    conversation.append({"role": "user", "content": user_input_to_content(user_input)})
+    # Prepare for inference
+    text = processor.apply_chat_template(conversation, add_generation_prompt=True, tokenize=False)
+    audios, images, videos = process_mm_info(conversation, use_audio_in_video=True)
+    inputs = processor(
+        text=text,
+        audios=audios,
+        images=images,
+        videos=videos,
+        return_tensors="pt",
+        padding=True
+    )
+    inputs = inputs.to(model.device).to(model.dtype)
+    # Generate response
+    if enable_audio_output:
+        voice_type_value = VOICE_OPTIONS.get(voice_type, "Chelsie")
+        text_ids, audio = model.generate(
+            **inputs,
+            use_audio_in_video=True,
+            return_audio=True,
+            spk=voice_type_value
+        )
+        # Save audio to temporary file
+        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
+            sf.write(
+                tmp_file.name,
+                audio.reshape(-1).detach().cpu().numpy(),
+                samplerate=24000,
+            )
+            audio_path = tmp_file.name
+    else:
+        text_ids = model.generate(
+            **inputs,
+            use_audio_in_video=True,
+            return_audio=False
+        )
+        audio_path = None
+    # Decode text response
+    text_response = processor.batch_decode(
+        text_ids,
+        skip_special_tokens=True,
+        clean_up_tokenization_spaces=False
+    )[0]
+    # Clean up text response
+    text_response = text_response.strip()
+    # Format user message for chat history display
+    user_message_for_display = str(text) if text is not None else ""
+    if image is not None:
+        user_message_for_display = (user_message_for_display or "Image uploaded") + " [Image]"
+    if audio is not None:
+        user_message_for_display = (user_message_for_display or "Audio uploaded") + " [Audio]"
+    if video is not None:
+        user_message_for_display = (user_message_for_display or "Video uploaded") + " [Video]"
+    # If empty, provide a default message
+    if not user_message_for_display.strip():
+        user_message_for_display = "Multimodal input"
+    # Update chat history with properly formatted entries
+    if not isinstance(chat_history, list):
+        chat_history = []
+    chat_history.append((user_message_for_display, text_response))
+    # Prepare output
+    if enable_audio_output and audio_path:
+        return chat_history, text_response, audio_path
+    else:
+        return chat_history, text_response, None
+def user_input_to_content(user_input):
+    if isinstance(user_input, str):
+        return user_input
+    elif isinstance(user_input, dict):
+        # Handle file uploads
+        content = []
+        if "text" in user_input and user_input["text"]:
+            content.append({"type": "text", "text": user_input["text"]})
+        if "image" in user_input and user_input["image"]:
+            content.append({"type": "image", "image": user_input["image"]})
+        if "audio" in user_input and user_input["audio"]:
+            content.append({"type": "audio", "audio": user_input["audio"]})
+        if "video" in user_input and user_input["video"]:
+            content.append({"type": "video", "video": user_input["video"]})
+        return content
+    return user_input
+def create_demo():
+    with gr.Blocks(title="Qwen2.5-Omni Chat Demo", theme=gr.themes.Soft()) as demo:
+        gr.Markdown("# Qwen2.5-Omni Multimodal Chat Demo")
+        gr.Markdown("Experience the omni-modal capabilities of Qwen2.5-Omni through text, images, audio, and video interactions.")
+        # Hidden placeholder components for text-only input
+        placeholder_image = gr.Image(type="filepath", visible=False)
+        placeholder_audio = gr.Audio(type="filepath", visible=False)
+        placeholder_video = gr.Video(visible=False)
+        # Chat interface
+        with gr.Row():
+            with gr.Column(scale=3):
+                chatbot = gr.Chatbot(height=600)
+                with gr.Accordion("Advanced Options", open=False):
+                    voice_type = gr.Dropdown(
+                        choices=list(VOICE_OPTIONS.keys()),
+                        value="Chelsie (Female)",
+                        label="Voice Type"
+                    )
+                    enable_audio_output = gr.Checkbox(
+                        value=True,
+                        label="Enable Audio Output"
+                    )
+                # Multimodal input components
+                with gr.Tabs():
+                    with gr.TabItem("Text Input"):
+                        text_input = gr.Textbox(
+                            placeholder="Type your message here...",
+                            label="Text Input"
+                        )
+                        text_submit = gr.Button("Send Text")
+                    with gr.TabItem("Multimodal Input"):
+                        with gr.Row():
+                            image_input = gr.Image(
+                                type="filepath",
+                                label="Upload Image"
+                            )
+                            audio_input = gr.Audio(
+                                type="filepath",
+                                label="Upload Audio"
+                            )
+                        with gr.Row():
+                            video_input = gr.Video(
+                                label="Upload Video"
+                            )
+                        additional_text = gr.Textbox(
+                            placeholder="Additional text message...",
+                            label="Additional Text"
+                        )
+                        multimodal_submit = gr.Button("Send Multimodal Input")
+                clear_button = gr.Button("Clear Chat")
+            with gr.Column(scale=1):
+                gr.Markdown("## Model Capabilities")
+                gr.Markdown("""
+                **Qwen2.5-Omni can:**
+                - Process and understand text
+                - Analyze images and answer questions about them
+                - Transcribe and understand audio
+                - Analyze video content (with or without audio)
+                - Generate natural speech responses
+                """)
+                gr.Markdown("### Example Prompts")
+                gr.Examples(
+                    examples=[
+                        ["Describe what you see in this image", "image"],
+                        ["What is being said in this audio clip?", "audio"],
+                        ["What's happening in this video?", "video"],
+                        ["Explain Artificial Intelligence in simple terms", "text"],
+                        ["Generate a short story about a robot learning to play AlphaGo", "text"]
+                    ],
+                    inputs=[text_input, gr.Textbox(visible=False)],
+                    label="Text Examples"
+                )
+                audio_output = gr.Audio(
+                    label="Model Speech Output",
+                    visible=True,
+                    autoplay=True
+                )
+                text_output = gr.Textbox(
+                    label="Model Text Response",
+                    interactive=False
+                )
+        # Text input handling
+        text_submit.click(
+            fn=lambda text: str(text) if text is not None else "",
+            inputs=text_input,
+            outputs=[chatbot],
+            queue=False
+        ).then(
+            fn=process_input,
+            inputs=[placeholder_image, placeholder_audio, placeholder_video, text_input, chatbot, voice_type, enable_audio_output],
+            outputs=[chatbot, text_output, audio_output]
+        )
+        # Multimodal input handling
+        def prepare_multimodal_input(image, audio, video, text):
+            # Create a display message that indicates what was uploaded
+            display_message = str(text) if text is not None else ""
+            if image is not None:
+                display_message = (display_message + " " if display_message.strip() else "") + "[Image]"
+            if audio is not None:
+                display_message = (display_message + " " if display_message.strip() else "") + "[Audio]"
+            if video is not None:
+                display_message = (display_message + " " if display_message.strip() else "") + "[Video]"
+            if not display_message.strip():
+                display_message = "Multimodal content"
+            return display_message
+        multimodal_submit.click(
+            fn=prepare_multimodal_input,
+            inputs=[image_input, audio_input, video_input, additional_text],
+            outputs=[chatbot],
+            queue=False
+        ).then(
+            fn=process_input,
+            inputs=[image_input, audio_input, video_input, additional_text,
+                   chatbot, voice_type, enable_audio_output],
+            outputs=[chatbot, text_output, audio_output]
+        )
+        # Clear chat
+        def clear_chat():
+            return [], None, None
+        clear_button.click(
+            fn=clear_chat,
+            outputs=[chatbot, text_output, audio_output]
+        )
+        # Update audio output visibility
+        def toggle_audio_output(enable_audio):
+            return gr.Audio(visible=enable_audio)
+        enable_audio_output.change(
+            fn=toggle_audio_output,
+            inputs=enable_audio_output,
+            outputs=audio_output
+        )
+    return demo
+if __name__ == "__main__":
+    demo = create_demo()
+    demo.launch(server_name="0.0.0.0", server_port=7860)