Spaces:
Running
on
Zero
Running
on
Zero
Update app.py
Browse files
app.py
CHANGED
@@ -7,7 +7,7 @@ import torch
|
|
7 |
import edge_tts
|
8 |
import asyncio
|
9 |
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
|
10 |
-
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
|
11 |
from transformers.image_utils import load_image
|
12 |
import time
|
13 |
|
@@ -82,18 +82,13 @@ def generate(
|
|
82 |
files = input_dict.get("files", [])
|
83 |
|
84 |
# Check if input includes image(s)
|
85 |
-
if len(files) > 1:
|
86 |
-
images = [load_image(image) for image in files]
|
87 |
-
elif len(files) == 1:
|
88 |
-
images = [load_image(files[0])]
|
89 |
-
else:
|
90 |
-
images = []
|
91 |
|
92 |
# Check if message is for TTS
|
93 |
tts_prefix = "@tts"
|
94 |
is_tts = any(text.strip().lower().startswith(f"{tts_prefix}{i}") for i in range(1, 7))
|
95 |
voice_index = next((i for i in range(1, 7) if text.strip().lower().startswith(f"{tts_prefix}{i}")), None)
|
96 |
-
|
97 |
if is_tts and voice_index:
|
98 |
voice = TTS_VOICES[voice_index - 1]
|
99 |
text = text.replace(f"{tts_prefix}{voice_index}", "").strip()
|
@@ -114,7 +109,6 @@ def generate(
|
|
114 |
prompt = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
|
115 |
inputs = processor(text=[prompt], images=images, return_tensors="pt", padding=True).to("cuda")
|
116 |
|
117 |
-
# Handle generation for multimodal input
|
118 |
streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
|
119 |
generation_kwargs = dict(inputs, streamer=streamer, max_new_tokens=max_new_tokens)
|
120 |
|
@@ -163,8 +157,11 @@ def generate(
|
|
163 |
yield final_response
|
164 |
|
165 |
if is_tts and voice:
|
166 |
-
|
167 |
-
|
|
|
|
|
|
|
168 |
yield gr.Audio(output_file, autoplay=True)
|
169 |
|
170 |
demo = gr.ChatInterface(
|
|
|
7 |
import edge_tts
|
8 |
import asyncio
|
9 |
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
|
10 |
+
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
|
11 |
from transformers.image_utils import load_image
|
12 |
import time
|
13 |
|
|
|
82 |
files = input_dict.get("files", [])
|
83 |
|
84 |
# Check if input includes image(s)
|
85 |
+
images = [load_image(image) for image in files] if files else []
|
|
|
|
|
|
|
|
|
|
|
86 |
|
87 |
# Check if message is for TTS
|
88 |
tts_prefix = "@tts"
|
89 |
is_tts = any(text.strip().lower().startswith(f"{tts_prefix}{i}") for i in range(1, 7))
|
90 |
voice_index = next((i for i in range(1, 7) if text.strip().lower().startswith(f"{tts_prefix}{i}")), None)
|
91 |
+
|
92 |
if is_tts and voice_index:
|
93 |
voice = TTS_VOICES[voice_index - 1]
|
94 |
text = text.replace(f"{tts_prefix}{voice_index}", "").strip()
|
|
|
109 |
prompt = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
|
110 |
inputs = processor(text=[prompt], images=images, return_tensors="pt", padding=True).to("cuda")
|
111 |
|
|
|
112 |
streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
|
113 |
generation_kwargs = dict(inputs, streamer=streamer, max_new_tokens=max_new_tokens)
|
114 |
|
|
|
157 |
yield final_response
|
158 |
|
159 |
if is_tts and voice:
|
160 |
+
loop = asyncio.new_event_loop()
|
161 |
+
asyncio.set_event_loop(loop)
|
162 |
+
output_file = loop.run_until_complete(text_to_speech(final_response, voice))
|
163 |
+
|
164 |
+
# Separate yield for audio output
|
165 |
yield gr.Audio(output_file, autoplay=True)
|
166 |
|
167 |
demo = gr.ChatInterface(
|