import gradio as gr
import torch
from transformers import Qwen2_5OmniForConditionalGeneration, Qwen2_5OmniProcessor

# Load model and processor.
# Qwen2.5-Omni is a multimodal model that ships its own model class;
# AutoModelForCausalLM cannot load it. Requires a transformers release
# with Qwen2.5-Omni support.
model_name = "Qwen/Qwen2.5-Omni-3B"
processor = Qwen2_5OmniProcessor.from_pretrained(model_name)
model = Qwen2_5OmniForConditionalGeneration.from_pretrained(
    model_name, torch_dtype=torch.bfloat16, device_map="auto"
)

# Process inputs and generate a response.
# Simplified for the demo: only the text input is fed to the model; image
# and audio are accepted by the UI but not yet wired in (see the multimodal
# sketch at the end of this file).
def process_input(text_input, image_input=None, audio_input=None):
    conversation = [
        {"role": "user", "content": [{"type": "text", "text": text_input}]}
    ]
    prompt = processor.apply_chat_template(
        conversation, add_generation_prompt=True, tokenize=False
    )
    inputs = processor(text=prompt, return_tensors="pt", padding=True).to(model.device)

    # Generate text only; return_audio=False skips the talker (speech) stage.
    text_ids = model.generate(**inputs, max_new_tokens=200, return_audio=False)
    # Decode only the newly generated tokens, not the echoed prompt.
    response_text = processor.batch_decode(
        text_ids[:, inputs["input_ids"].shape[1]:], skip_special_tokens=True
    )[0]

    # Placeholder for speech generation (see the sketch below).
    response_audio = None
    return response_text, response_audio

# Gradio interface
with gr.Blocks() as demo:
    gr.Markdown("# Qwen2.5-Omni-3B Demo")
    with gr.Row():
        text_input = gr.Textbox(label="Text Input")
        # File paths are the easiest form to hand to Qwen's multimodal utils.
        image_input = gr.Image(label="Upload Image", type="filepath")
        audio_input = gr.Audio(label="Upload Audio", type="filepath")
    submit_button = gr.Button("Submit")
    text_output = gr.Textbox(label="Text Response")
    audio_output = gr.Audio(label="Audio Response")
    submit_button.click(
        fn=process_input,
        inputs=[text_input, image_input, audio_input],
        outputs=[text_output, audio_output],
    )

# Launch the app
demo.launch()
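
# ---------------------------------------------------------------------------
# Sketch: full multimodal input plus spoken output. A minimal sketch, not a
# verified implementation: it follows the Qwen2.5-Omni model card and assumes
# the qwen-omni-utils helper package (pip install qwen-omni-utils). Check the
# generate() conventions (return_audio, speaker) against your installed
# versions. To use it, define this function above the Gradio block and pass
# fn=process_input_multimodal to submit_button.click().

def process_input_multimodal(text_input, image_input=None, audio_input=None):
    from qwen_omni_utils import process_mm_info  # pip install qwen-omni-utils

    content = [{"type": "text", "text": text_input}]
    if image_input is not None:   # filepath from gr.Image(type="filepath")
        content.append({"type": "image", "image": image_input})
    if audio_input is not None:   # filepath from gr.Audio(type="filepath")
        content.append({"type": "audio", "audio": audio_input})
    conversation = [
        # The model card's system prompt is required for speech output.
        {"role": "system", "content": [{"type": "text", "text": (
            "You are Qwen, a virtual human developed by the Qwen Team, "
            "Alibaba Group, capable of perceiving auditory and visual inputs, "
            "as well as generating text and speech."
        )}]},
        {"role": "user", "content": content},
    ]
    prompt = processor.apply_chat_template(
        conversation, add_generation_prompt=True, tokenize=False
    )
    audios, images, videos = process_mm_info(conversation, use_audio_in_video=False)
    mm_inputs = processor(
        text=prompt, audio=audios, images=images, videos=videos,
        return_tensors="pt", padding=True,
    ).to(model.device).to(model.dtype)

    # With return_audio=True, generate() yields both token ids and a waveform.
    text_ids, audio = model.generate(**mm_inputs, max_new_tokens=200, return_audio=True)
    response_text = processor.batch_decode(text_ids, skip_special_tokens=True)[0]
    # The talker emits a 24 kHz waveform; gr.Audio accepts a (rate, ndarray)
    # tuple. Cast to float32 since numpy cannot represent bfloat16.
    response_audio = (24000, audio.reshape(-1).detach().float().cpu().numpy())
    return response_text, response_audio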