Sergidev committed
Commit addbfa5 · 1 Parent(s): ea71575
Files changed (2):
  1. app.py +42 -26
  2. requirements.txt +4 -3
app.py CHANGED
@@ -1,49 +1,65 @@
  import gradio as gr
- from smolagents import CodeAgent, HfApiModel
- from gradio_client import Client
+ from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
+ import torch
  import tempfile

- # Initialize Qwen2.5-Omni-7B via SmolAgents
- qwen_agent = CodeAgent(
-     tools=[], # Add multimodal tools as needed
-     model=HfApiModel("Qwen/Qwen2.5-Omni-7B"),
-     execution_timeout=120
+ # Initialize Qwen2.5-Omni-7B with multimodal support
+ model = AutoModelForCausalLM.from_pretrained(
+     "Qwen/Qwen2.5-Omni-7B",
+     torch_dtype=torch.float16,
+     device_map="auto"
  )
+ tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-Omni-7B")

- def process_video(video_path, prompt, request: gr.Request):
-     # Handle ZeroGPU rate limiting
+ def analyze_media(video_path, prompt, request: gr.Request):
+     # ZeroGPU rate limiting headers
      headers = {"X-IP-Token": request.headers.get('x-ip-token', '')}

-     # Process video with Qwen's multimodal capabilities
-     response = qwen_agent.run(
-         f"Analyze this video: {video_path} and {prompt}",
-         headers=headers
+     # Create multimodal pipeline
+     pipe = pipeline(
+         "multimodal-generation",
+         model=model,
+         tokenizer=tokenizer,
+         device=model.device,
+         max_new_tokens=1024,
+         generate_speech=True
      )

-     # Generate real-time speech response
-     audio_response = qwen_agent.model.generate_speech(response)
+     # Process 120s video with TMRoPE alignment
+     result = pipe(
+         media=video_path,
+         text=prompt,
+         headers=headers,
+         timeout=120
+     )

-     return response, audio_response
+     # Save speech output to temporary file
+     with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
+         result["speech"].export(f.name, format="wav")
+     return result["text"], f.name

  with gr.Blocks() as demo:
-     gr.Markdown("## Multimodal AI Demo with Qwen2.5-Omni-7B")
+     gr.Markdown("## Qwen2.5-Omni-7B Multimodal Demo")

      with gr.Row():
-         video_input = gr.Video(label="Upload Video", sources=["upload"])
-         prompt_input = gr.Textbox(label="Analysis Prompt")
+         media_input = gr.Video(
+             label="Upload Video (max 120s)",
+             sources=["upload"],
+             max_length=120
+         )
+         prompt_input = gr.Textbox(label="Analysis Prompt", placeholder="Describe or ask about the video...")

-     submit_btn = gr.Button("Analyze")
+     submit_btn = gr.Button("Analyze", variant="primary")

      with gr.Column():
-         text_output = gr.Textbox(label="Analysis Results")
-         audio_output = gr.Audio(label="Voice Response", autoplay=True)
+         text_output = gr.Textbox(label="Analysis Results", interactive=False)
+         audio_output = gr.Audio(label="Speech Response", autoplay=True)

      submit_btn.click(
-         process_video,
-         inputs=[video_input, prompt_input, gr.Request()],
+         analyze_media,
+         inputs=[media_input, prompt_input, gr.Request()],
          outputs=[text_output, audio_output]
      )

- # ZeroGPU configuration
- demo.queue(default_concurrency_limit=5)
+ demo.queue(concurrency_count=2)
  demo.launch(server_name="0.0.0.0", server_port=7860)
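Note on the new wiring: several of the added calls do not match the public APIs they target at the pinned versions. transformers does not expose a "multimodal-generation" pipeline task or a generate_speech= flag; Gradio injects gr.Request through the handler's type annotation rather than through inputs=[...]; and Gradio 4.x uses queue(default_concurrency_limit=...) instead of the removed concurrency_count. A minimal sketch of the Gradio side under those assumptions, with the model call stubbed out as a hypothetical run_inference helper (not part of this commit):

import gradio as gr

def run_inference(video_path, prompt):
    # Hypothetical stand-in for the real Qwen2.5-Omni call; returns
    # (analysis text, path to a WAV file or None).
    return f"Stub analysis of {video_path}: {prompt}", None

def analyze_media(video_path, prompt, request: gr.Request):
    # Gradio fills `request` automatically because of the type annotation,
    # so it must not be listed under inputs= below.
    ip_token = request.headers.get("x-ip-token", "")  # ZeroGPU token, forward to the backend if needed
    return run_inference(video_path, prompt)

with gr.Blocks() as demo:
    with gr.Row():
        media_input = gr.Video(label="Upload Video", sources=["upload"])
        prompt_input = gr.Textbox(label="Analysis Prompt")
    submit_btn = gr.Button("Analyze", variant="primary")
    with gr.Column():
        text_output = gr.Textbox(label="Analysis Results", interactive=False)
        audio_output = gr.Audio(label="Speech Response", autoplay=True)

    submit_btn.click(
        analyze_media,
        inputs=[media_input, prompt_input],  # no gr.Request() here
        outputs=[text_output, audio_output],
    )

demo.queue(default_concurrency_limit=2)  # Gradio 4.x replacement for concurrency_count
demo.launch(server_name="0.0.0.0", server_port=7860)

Loading the checkpoint itself is a separate concern: Qwen2.5-Omni support arrived in later transformers releases with dedicated model and processor classes, so the AutoModelForCausalLM.from_pretrained(...) call above is unlikely to load this checkpoint against the transformers pin below.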
requirements.txt CHANGED
@@ -1,3 +1,4 @@
- smolagents>=0.9.0
- gradio_client>=1.8.0
- qwen2.5-omni
+ torch>=2.3.0
+ transformers>=4.41.0
+ gradio>=4.26.0
+ soundfile>=0.12.1
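Two details worth flagging about the new pins: device_map="auto" in app.py generally requires the accelerate package, which is not listed, and soundfile writes raw waveforms rather than exposing the pydub-style .export(...) used above. A minimal sketch of writing speech output with the pinned soundfile, where audio and sr are placeholder names for a NumPy waveform and its sampling rate (not values produced by this commit):

import tempfile

import numpy as np
import soundfile as sf

# Placeholder: one second of silence at 24 kHz standing in for the model's
# speech output; real values would come from the generation call.
audio = np.zeros(24000, dtype=np.float32)
sr = 24000

with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
    sf.write(f.name, audio, sr)  # soundfile API: write(path, data, samplerate)
    wav_path = f.name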