ajsbsd committed
Commit a2619b7 · verified · 1 parent: 88ac83e

Delete app.py

Files changed (1)
  1. app.py +0 -517
app.py DELETED
@@ -1,517 +0,0 @@
- import gradio as gr
- import torch
- from transformers import (
-     AutoTokenizer, AutoModelForCausalLM,
-     SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan,
-     WhisperProcessor, WhisperForConditionalGeneration
- )
- from datasets import load_dataset
- import os
- import spaces
- import tempfile
- import soundfile as sf
- import librosa
- import yaml
-
- # ================== Configuration ==================
- HUGGINGFACE_MODEL_ID = "HuggingFaceH4/Qwen2.5-1.5B-Instruct-gkd"
- TORCH_DTYPE = torch.bfloat16
- MAX_NEW_TOKENS = 512
- DO_SAMPLE = True
- TEMPERATURE = 0.7
- TOP_K = 50
- TOP_P = 0.95
-
- TTS_MODEL_ID = "microsoft/speecht5_tts"
- TTS_VOCODER_ID = "microsoft/speecht5_hifigan"
- STT_MODEL_ID = "openai/whisper-small"
-
- # ================== Global Variables ==================
- tokenizer = None
- llm_model = None
- tts_processor = None
- tts_model = None
- tts_vocoder = None
- speaker_embeddings = None
- whisper_processor = None
- whisper_model = None
- first_load = True
-
- # ================== UI Helpers ==================
- def generate_pretty_html(data):
-     html = """
-     <div style="font-family: Arial, sans-serif; max-width: 600px; margin: auto;
-                 background-color: #f9f9f9; border-radius: 10px; padding: 20px;
-                 box-shadow: 0 4px 12px rgba(0,0,0,0.1);">
-         <h2 style="color: #2c3e50; border-bottom: 2px solid #ddd; padding-bottom: 10px;">Model Info</h2>
-     """
-     for key, value in data.items():
-         html += f"""
-         <div style="margin-bottom: 12px;">
-             <strong style="color: #34495e; display: inline-block; width: 160px;">{key}:</strong>
-             <span style="color: #2c3e50;">{value}</span>
-         </div>
-         """
-     html += "</div>"
-     return html
-
- def load_config():
-     with open("config.yaml", "r", encoding="utf-8") as f:
-         return yaml.safe_load(f)  # Loads only the first document
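- # For illustration only: config.yaml is expected to be a flat key/value mapping,
- # which generate_pretty_html renders one row per key. A hypothetical example:
- #
- #   Model: HuggingFaceH4/Qwen2.5-1.5B-Instruct-gkd
- #   TTS: microsoft/speecht5_tts
- #   STT: openai/whisper-small
- #   Max new tokens: 512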
-
- def render_modern_info():
-     try:
-         config = load_config()
-         return generate_pretty_html(config)
-     except Exception as e:
-         return f"<div style='color: red;'>Error loading config: {str(e)}</div>"
-
- def load_readme():
-     with open("README.md", "r", encoding="utf-8") as f:
-         return f.read()
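- # Note: load_readme is defined here but not currently called anywhere in the UI below.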
-
- # ================== Helper Functions ==================
- def split_text_into_chunks(text, max_chars=400):
-     sentences = text.replace("...", ".").split(". ")
-     chunks = []
-     current_chunk = ""
-     for sentence in sentences:
-         if len(current_chunk) + len(sentence) + 2 < max_chars:
-             current_chunk += ". " + sentence if current_chunk else sentence
-         else:
-             chunks.append(current_chunk)
-             current_chunk = sentence
-     if current_chunk:
-         chunks.append(current_chunk)
-     return [f"{chunk}." for chunk in chunks if chunk.strip()]
-
- # ================== Model Loading ==================
- @spaces.GPU
- def load_models():
-     global tokenizer, llm_model, tts_processor, tts_model, tts_vocoder, speaker_embeddings, whisper_processor, whisper_model
-     hf_token = os.environ.get("HF_TOKEN")
-
-     # LLM
-     if tokenizer is None or llm_model is None:
-         try:
-             tokenizer = AutoTokenizer.from_pretrained(HUGGINGFACE_MODEL_ID, token=hf_token)
-             if tokenizer.pad_token is None:
-                 tokenizer.pad_token = tokenizer.eos_token
-             llm_model = AutoModelForCausalLM.from_pretrained(
-                 HUGGINGFACE_MODEL_ID,
-                 torch_dtype=TORCH_DTYPE,
-                 device_map="auto",
-                 token=hf_token
-             ).eval()
-             print("LLM loaded successfully.")
-         except Exception as e:
-             print(f"Error loading LLM: {e}")
-
-     # TTS
-     if tts_processor is None or tts_model is None or tts_vocoder is None:
-         try:
-             tts_processor = SpeechT5Processor.from_pretrained(TTS_MODEL_ID, token=hf_token)
-             tts_model = SpeechT5ForTextToSpeech.from_pretrained(TTS_MODEL_ID, token=hf_token)
-             tts_vocoder = SpeechT5HifiGan.from_pretrained(TTS_VOCODER_ID, token=hf_token)
-             embeddings = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation", token=hf_token)
-             speaker_embeddings = torch.tensor(embeddings[7306]["xvector"]).unsqueeze(0)
-             device = llm_model.device if llm_model else 'cpu'
-             tts_model.to(device)
-             tts_vocoder.to(device)
-             speaker_embeddings = speaker_embeddings.to(device)
-             print("TTS models loaded.")
-         except Exception as e:
-             print(f"Error loading TTS: {e}")
-
-     # STT
-     if whisper_processor is None or whisper_model is None:
-         try:
-             whisper_processor = WhisperProcessor.from_pretrained(STT_MODEL_ID, token=hf_token)
-             whisper_model = WhisperForConditionalGeneration.from_pretrained(STT_MODEL_ID, token=hf_token)
-             device = llm_model.device if llm_model else 'cpu'
-             whisper_model.to(device)
-             print("Whisper loaded.")
-         except Exception as e:
-             print(f"Error loading Whisper: {e}")
-
- # ================== Chat & Audio Functions ==================
- @spaces.GPU
- def generate_response_and_audio(message, history):
-     global first_load
-     if first_load:
-         load_models()
-         first_load = False
-
-     global tokenizer, llm_model, tts_processor, tts_model, tts_vocoder, speaker_embeddings
-
-     if tokenizer is None or llm_model is None:
-         return history + [{"role": "user", "content": message},
-                           {"role": "assistant", "content": "Error: LLM not loaded."}], None
-
-     messages = history.copy()
-     messages.append({"role": "user", "content": message})
-
-     try:
-         input_text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
-     except Exception:
-         input_text = ""
-         for item in history:
-             input_text += f"{item['role'].capitalize()}: {item['content']}\n"
-         input_text += f"User: {message}\nAssistant:"
-
-     try:
-         inputs = tokenizer(input_text, return_tensors="pt", padding=True, truncation=True).to(llm_model.device)
-         output_ids = llm_model.generate(
-             inputs["input_ids"],
-             attention_mask=inputs["attention_mask"],
-             max_new_tokens=MAX_NEW_TOKENS,
-             do_sample=DO_SAMPLE,
-             temperature=TEMPERATURE,
-             top_k=TOP_K,
-             top_p=TOP_P,
-             pad_token_id=tokenizer.eos_token_id
-         )
-         generated_text = tokenizer.decode(output_ids[0][inputs["input_ids"].shape[-1]:], skip_special_tokens=True).strip()
-     except Exception as e:
-         print(f"LLM error: {e}")
-         return history + [{"role": "user", "content": message},
-                           {"role": "assistant", "content": "I had an issue generating a response."}], None
-
-     audio_path = None
-     if None not in [tts_processor, tts_model, tts_vocoder, speaker_embeddings]:
-         try:
-             device = llm_model.device
-             text_chunks = split_text_into_chunks(generated_text)
-
-             full_speech = []
-             for chunk in text_chunks:
-                 tts_inputs = tts_processor(text=chunk, return_tensors="pt", max_length=512, truncation=True).to(device)
-                 speech = tts_model.generate_speech(tts_inputs["input_ids"], speaker_embeddings, vocoder=tts_vocoder)
-                 full_speech.append(speech.cpu())
-
-             full_speech_tensor = torch.cat(full_speech, dim=0)
-
-             with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
-                 audio_path = tmp_file.name
-                 sf.write(audio_path, full_speech_tensor.numpy(), samplerate=16000)
-
-         except Exception as e:
-             print(f"TTS error: {e}")
-
-     # Echo the user turn back so it appears in the Chatbot alongside the reply.
-     return history + [{"role": "user", "content": message},
-                       {"role": "assistant", "content": generated_text}], audio_path
-
- @spaces.GPU
- def transcribe_audio(filepath):
-     global first_load
-     if first_load:
-         load_models()
-         first_load = False
-
-     global whisper_processor, whisper_model
-     if whisper_model is None:
-         return "Whisper model not loaded."
-
-     try:
-         audio, sr = librosa.load(filepath, sr=16000)
-         inputs = whisper_processor(audio, sampling_rate=sr, return_tensors="pt").input_features.to(whisper_model.device)
-         outputs = whisper_model.generate(inputs)
-         return whisper_processor.batch_decode(outputs, skip_special_tokens=True)[0]
-     except Exception as e:
-         return f"Transcription failed: {e}"
-
- # ================== Gradio UI ==================
- with gr.Blocks(head="""
-     <script src="https://cdn.tailwindcss.com"></script>
- """) as demo:
-     gr.Markdown("""
-     <div class="bg-gray-900 text-white p-4 rounded-lg shadow-md mb-6">
-         <h1 class="text-2xl font-bold">Qwen2.5 Chatbot with Voice Input/Output</h1>
-         <p class="text-gray-300">Powered by Gradio + TailwindCSS</p>
-     </div>
-     """)
-
-     with gr.Tab("Chat"):
-         gr.HTML("""
-         <div class="bg-gray-800 p-4 rounded-lg mb-4">
-             <label class="block text-gray-300 font-medium mb-2">Chat Interface</label>
-         </div>
-         """)
-         chatbot = gr.Chatbot(type='messages', elem_classes=["bg-gray-800", "text-white"])
-         text_input = gr.Textbox(
-             placeholder="Type your message...",
-             label="User Input",
-             elem_classes=["bg-gray-700", "text-white", "border-gray-600"]
-         )
-         audio_output = gr.Audio(label="Response Audio", autoplay=True)
-         text_input.submit(generate_response_and_audio, [text_input, chatbot], [chatbot, audio_output])
-
-     with gr.Tab("Transcribe"):
-         gr.HTML("""
-         <div class="bg-gray-800 p-4 rounded-lg mb-4">
-             <label class="block text-gray-300 font-medium mb-2">Audio Transcription</label>
-         </div>
-         """)
-         audio_input = gr.Audio(type="filepath", label="Upload Audio")
-         transcribed = gr.Textbox(
-             label="Transcription",
-             elem_classes=["bg-gray-700", "text-white", "border-gray-600"]
-         )
-         audio_input.upload(transcribe_audio, audio_input, transcribed)
-
-     clear_btn = gr.Button("Clear All", elem_classes=["bg-gray-600", "hover:bg-gray-500", "text-white", "mt-4"])
-     clear_btn.click(lambda: ([], "", None), None, [chatbot, text_input, audio_output])
-
-     html_output = gr.HTML("""
-     <div class="bg-gray-800 text-white p-4 rounded-lg mt-6 text-center">
-         Loading model info...
-     </div>
-     """)
-     demo.load(fn=render_modern_info, outputs=html_output)
-
- # ================== Launch App ==================
- demo.queue().launch()