import gradio as gr
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    SpeechT5Processor,
    SpeechT5ForTextToSpeech,
    SpeechT5HifiGan,
    WhisperProcessor,
    WhisperForConditionalGeneration
)
from datasets import load_dataset
import os
import spaces
import tempfile
import soundfile as sf
import librosa
import yaml

# ================== Configuration ==================
HUGGINGFACE_MODEL_ID = "HuggingFaceH4/Qwen2.5-1.5B-Instruct-gkd"
TORCH_DTYPE = torch.bfloat16
MAX_NEW_TOKENS = 512
DO_SAMPLE = True
TEMPERATURE = 0.7
TOP_K = 50
TOP_P = 0.95

TTS_MODEL_ID = "microsoft/speecht5_tts"
TTS_VOCODER_ID = "microsoft/speecht5_hifigan"
STT_MODEL_ID = "openai/whisper-small"

# ================== Global Variables ==================
tokenizer = None
llm_model = None
tts_processor = None
tts_model = None
tts_vocoder = None
speaker_embeddings = None
whisper_processor = None
whisper_model = None
first_load = True


### UI Helpers
def generate_pretty_html(data):
    html = """
Powered by Gradio + TailwindCSS