prithivMLmods committed (verified)
Commit f8a9b16 · 1 Parent(s): 2aadb64

Update app.py

Files changed (1):
  app.py (+8 -11)
app.py CHANGED
@@ -7,7 +7,7 @@ import torch
 import edge_tts
 import asyncio
 from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
-from transformers import Qwen2VLForConditionalGeneration, AutoProcessor, TextIteratorStreamer
+from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
 from transformers.image_utils import load_image
 import time
 
@@ -82,18 +82,13 @@ def generate(
     files = input_dict.get("files", [])
 
     # Check if input includes image(s)
-    if len(files) > 1:
-        images = [load_image(image) for image in files]
-    elif len(files) == 1:
-        images = [load_image(files[0])]
-    else:
-        images = []
+    images = [load_image(image) for image in files] if files else []
 
     # Check if message is for TTS
     tts_prefix = "@tts"
     is_tts = any(text.strip().lower().startswith(f"{tts_prefix}{i}") for i in range(1, 7))
     voice_index = next((i for i in range(1, 7) if text.strip().lower().startswith(f"{tts_prefix}{i}")), None)
-
+
     if is_tts and voice_index:
         voice = TTS_VOICES[voice_index - 1]
         text = text.replace(f"{tts_prefix}{voice_index}", "").strip()
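The image-loading change above collapses three branches into one conditional
expression. A minimal sketch of the equivalence, using hypothetical file
paths; note that a comprehension over an empty list already yields [], so
the trailing "if files else []" guard is defensive rather than required:

    from transformers.image_utils import load_image

    files = ["photo1.png", "photo2.png"]   # hypothetical inputs

    # Old form: separate branches for many, one, and zero files.
    if len(files) > 1:
        images = [load_image(image) for image in files]
    elif len(files) == 1:
        images = [load_image(files[0])]
    else:
        images = []

    # New form: one expression covering all three cases.
    images = [load_image(image) for image in files] if files else []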
@@ -114,7 +109,6 @@ def generate(
     prompt = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
     inputs = processor(text=[prompt], images=images, return_tensors="pt", padding=True).to("cuda")
 
-    # Handle generation for multimodal input
     streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
     generation_kwargs = dict(inputs, streamer=streamer, max_new_tokens=max_new_tokens)
 
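Context for the streamer wiring in this hunk: TextIteratorStreamer only
produces text while model.generate runs on a separate thread. The thread
launch sits outside the hunk; the usual transformers pattern, assuming the
model, processor, inputs, and max_new_tokens prepared earlier in
generate(), is:

    from threading import Thread

    # The streamer decodes tokens as generate() emits them on a worker
    # thread; skip_prompt drops the echoed input, skip_special_tokens
    # drops markers such as end-of-sequence.
    streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
    generation_kwargs = dict(inputs, streamer=streamer, max_new_tokens=max_new_tokens)

    thread = Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()

    buffer = ""
    for new_text in streamer:   # blocks until the next decoded chunk arrives
        buffer += new_text
        # yield buffer here to stream partial output to the UI
    thread.join()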
@@ -163,8 +157,11 @@ def generate(
     yield final_response
 
     if is_tts and voice:
-        output_file = asyncio.run(text_to_speech(final_response, voice))
-        # Return playable audio separately
+        loop = asyncio.new_event_loop()
+        asyncio.set_event_loop(loop)
+        output_file = loop.run_until_complete(text_to_speech(final_response, voice))
+
+        # Separate yield for audio output
         yield gr.Audio(output_file, autoplay=True)
 
 demo = gr.ChatInterface(
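The asyncio change is the substantive fix here. asyncio.run() raises a
RuntimeError when called from a thread that already has a running event
loop, which can happen inside an async-aware server such as the Gradio
stack hosting this handler; building and driving a dedicated loop avoids
that. A minimal sketch of the pattern, with a stub standing in for the
app's edge_tts-backed text_to_speech coroutine and a hypothetical wrapper
name:

    import asyncio

    async def text_to_speech(text: str, voice: str) -> str:
        # Stub for the app's edge_tts call; simulates async work and
        # returns the path the real coroutine would write.
        await asyncio.sleep(0)
        return "output.mp3"

    def speak(text: str, voice: str) -> str:   # hypothetical wrapper
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)
        try:
            return loop.run_until_complete(text_to_speech(text, voice))
        finally:
            loop.close()   # cleanup the commit itself leaves out

    print(speak("hello", "en-US-JennyNeural"))   # assumed edge-tts voice id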
 