import gradio as gr
import torch
from transformers import Qwen2_5OmniForConditionalGeneration, Qwen2_5OmniProcessor
from qwen_omni_utils import process_mm_info  # helper from the qwen-omni-utils package for loading image/audio/video inputs

# Load the model and processor.
# Qwen2.5-Omni is a multimodal (text/image/audio/video) model, so it uses its
# dedicated classes rather than AutoModelForCausalLM/AutoProcessor.
model_name = "Qwen/Qwen2.5-Omni-3B"
model = Qwen2_5OmniForConditionalGeneration.from_pretrained(
    model_name, torch_dtype=torch.bfloat16, device_map="auto"
)
processor = Qwen2_5OmniProcessor.from_pretrained(model_name)
device = model.device
# Build a conversation from the inputs and generate a text response
def process_input(text_input, image_input=None, audio_input=None, video_input=None):
    content = [{"type": "text", "text": text_input}]
    if image_input:
        content.append({"type": "image", "image": image_input})
    if audio_input:
        content.append({"type": "audio", "audio": audio_input})
    if video_input:
        content.append({"type": "video", "video": video_input})
    conversation = [{"role": "user", "content": content}]

    # Render the chat template, load the multimodal files, and build the model inputs
    prompt = processor.apply_chat_template(conversation, add_generation_prompt=True, tokenize=False)
    audios, images, videos = process_mm_info(conversation, use_audio_in_video=False)
    model_inputs = processor(
        text=prompt, audio=audios, images=images, videos=videos,
        return_tensors="pt", padding=True, use_audio_in_video=False,
    ).to(device)

    # Generate a text-only response (the speech head is not used here)
    outputs = model.generate(**model_inputs, max_new_tokens=256, return_audio=False, use_audio_in_video=False)
    response_text = processor.batch_decode(outputs, skip_special_tokens=True)[0]

    # Audio output not implemented
    response_audio = None
    return response_text, response_audio
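# Optional speech output: a rough sketch, not wired into process_input above.
# It assumes (per Qwen2.5-Omni's documented usage) that generate(..., return_audio=True)
# returns (text_ids, waveform) with a 24 kHz waveform, and that the conversation
# includes the model's default system prompt, which audio generation requires:
#
#   text_ids, audio = model.generate(**model_inputs, max_new_tokens=256, return_audio=True)
#   response_text = processor.batch_decode(text_ids, skip_special_tokens=True)[0]
#   response_audio = (24000, audio.reshape(-1).detach().cpu().numpy())  # (rate, ndarray) is accepted by gr.Audio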
# Gradio interface
with gr.Blocks() as demo:
    gr.Markdown("# Qwen2.5-Omni-3B Demo")
    with gr.Row():
        text_input = gr.Textbox(label="Text Input")
        image_input = gr.Image(label="Upload Image", type="filepath")
        audio_input = gr.Audio(label="Upload Audio", type="filepath")
        video_input = gr.Video(label="Upload Video")  # gr.Video passes a file path by default
    submit_button = gr.Button("Submit")
    text_output = gr.Textbox(label="Text Response")
    audio_output = gr.Audio(label="Audio Response")
    submit_button.click(
        fn=process_input,
        inputs=[text_input, image_input, audio_input, video_input],
        outputs=[text_output, audio_output],
    )

# Launch the app
demo.launch()
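# Example (hypothetical file path): the handler can also be called directly to
# sanity-check the model before launching the UI:
#
#   text, audio = process_input("Describe this image.", image_input="cat.jpg")
#   print(text)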