import gradio as gr
from smolagents import CodeAgent, HfApiModel
from gradio_client import Client  # imported for downstream Space calls; currently unused
import tempfile  # currently unused; kept for compatibility with the rest of the project

# Initialize Qwen2.5-Omni-7B via SmolAgents.
# NOTE(review): `execution_timeout` is not a documented CodeAgent constructor
# parameter — confirm against the pinned smolagents version; if unsupported,
# CodeAgent(...) will raise TypeError at import time.
qwen_agent = CodeAgent(
    tools=[],  # Add multimodal tools as needed
    model=HfApiModel("Qwen/Qwen2.5-Omni-7B"),
    execution_timeout=120,
)


def process_video(video_path, prompt, request: gr.Request):
    """Analyze an uploaded video with the Qwen agent and return text + speech.

    Args:
        video_path: Filesystem path of the uploaded video (from ``gr.Video``).
        prompt: Free-text analysis instruction from the user.
        request: Injected automatically by Gradio because of the
            ``gr.Request`` type annotation — it must NOT appear in the
            event's ``inputs`` list.

    Returns:
        Tuple of (analysis text, audio response) for the two output components.
    """
    # Forward the ZeroGPU per-IP token so GPU quota is attributed to the
    # end user rather than to this Space.
    headers = {"X-IP-Token": request.headers.get("x-ip-token", "")}

    # Process video with Qwen's multimodal capabilities.
    # CodeAgent.run has no `headers` keyword; extra runtime variables are
    # handed to the agent via `additional_args`.
    response = qwen_agent.run(
        f"Analyze this video: {video_path} and {prompt}",
        additional_args={"headers": headers},
    )

    # Generate real-time speech response.
    # NOTE(review): HfApiModel exposes no `generate_speech` method in released
    # smolagents versions — verify this call or replace it with an explicit
    # TTS client before shipping.
    audio_response = qwen_agent.model.generate_speech(response)
    return response, audio_response


with gr.Blocks() as demo:
    gr.Markdown("## Multimodal AI Demo with Qwen2.5-Omni-7B")
    with gr.Row():
        video_input = gr.Video(label="Upload Video", sources=["upload"])
        prompt_input = gr.Textbox(label="Analysis Prompt")
        submit_btn = gr.Button("Analyze")
    with gr.Column():
        text_output = gr.Textbox(label="Analysis Results")
        audio_output = gr.Audio(label="Voice Response", autoplay=True)

    # `gr.Request()` must not be listed as an input component — Gradio injects
    # the request object based on the handler's type annotation. Passing an
    # instance here is an error.
    submit_btn.click(
        process_video,
        inputs=[video_input, prompt_input],
        outputs=[text_output, audio_output],
    )

# ZeroGPU configuration: bound concurrency so GPU workers aren't oversubscribed.
demo.queue(default_concurrency_limit=5)
demo.launch(server_name="0.0.0.0", server_port=7860)