import gradio as gr
from transformers import Qwen2_5OmniForConditionalGeneration, Qwen2_5OmniProcessor
import torch

# Load model and processor.
# Qwen2.5-Omni ships its own model/processor classes in recent transformers
# releases; AutoModelForCausalLM does not cover this architecture.
model_name = "Qwen/Qwen2.5-Omni-3B"
model = Qwen2_5OmniForConditionalGeneration.from_pretrained(
    model_name, torch_dtype=torch.bfloat16, device_map="auto"
)
processor = Qwen2_5OmniProcessor.from_pretrained(model_name)
device = model.device
# Build a multimodal conversation from the inputs and generate a response.
def process_input(text_input, image_input=None, audio_input=None, video_input=None):
    content = [{"type": "text", "text": text_input}]
    if image_input:
        content.append({"type": "image", "image": image_input})
    if audio_input:
        content.append({"type": "audio", "audio": audio_input})
    if video_input:
        content.append({"type": "video", "video": video_input})
    conversation = [{"role": "user", "content": content}]

    # Tokenize the conversation; recent transformers releases also load the
    # media files referenced in the conversation during apply_chat_template.
    model_inputs = processor.apply_chat_template(
        conversation,
        add_generation_prompt=True,
        tokenize=True,
        return_dict=True,
        return_tensors="pt",
    ).to(device)

    # Generate a text response; return_audio=False skips the Talker's speech
    # synthesis. Cap the number of new tokens rather than the total length.
    outputs = model.generate(**model_inputs, max_new_tokens=200, return_audio=False)

    # Decode only the newly generated tokens, not the echoed prompt.
    new_tokens = outputs[0][model_inputs["input_ids"].shape[1]:]
    response_text = processor.decode(new_tokens, skip_special_tokens=True)

    # Audio output not implemented in this demo.
    response_audio = None
    return response_text, response_audio
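
# A minimal smoke test of process_input, assuming a local image file named
# "example.jpg" exists (hypothetical path; uncomment to try it):
#
#     text, audio = process_input("Describe this image.", image_input="example.jpg")
#     print(text)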
# Gradio interface
with gr.Blocks() as demo:
    gr.Markdown("# Qwen2.5-Omni-3B Demo")
    with gr.Row():
        text_input = gr.Textbox(label="Text Input")
        image_input = gr.Image(label="Upload Image", type="filepath")
        audio_input = gr.Audio(label="Upload Audio", type="filepath")
        # gr.Video takes no `type` parameter; it always yields a filepath.
        video_input = gr.Video(label="Upload Video")
    submit_button = gr.Button("Submit")
    text_output = gr.Textbox(label="Text Response")
    audio_output = gr.Audio(label="Audio Response")

    submit_button.click(
        fn=process_input,
        inputs=[text_input, image_input, audio_input, video_input],
        outputs=[text_output, audio_output],
    )
# Launch the app
demo.launch()
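
# To run this demo locally (assumed setup; Qwen2.5-Omni support requires a
# recent transformers release):
#     pip install gradio torch accelerate transformers
#     python app.py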