import torch
import gradio as gr
from transformers import AutoModel, AutoTokenizer

# Load model and tokenizer
device = "cpu"  # Change to "cuda" if a GPU is available
model = AutoModel.from_pretrained("ai4bharat/vits_rasa_13", trust_remote_code=True).to(device)
tokenizer = AutoTokenizer.from_pretrained("ai4bharat/vits_rasa_13", trust_remote_code=True)

# Speaker IDs per language
LANG_SPEAKER_MAP = {
    "mar": 13,  # Marathi Male
    "hin": 13,  # Reuse Marathi Male speaker for Hindi
    "san": 17,  # Sanskrit Male
}
DEFAULT_STYLE_ID = 0  # ALEXA style


def generate_audio(text, language):
    if not text.strip():
        # Raise gr.Error so the message shows up in the UI instead of being
        # passed to the Audio output, which expects audio data, not a string.
        raise gr.Error("Text cannot be empty.")

    speaker_id = LANG_SPEAKER_MAP.get(language.lower())
    if speaker_id is None:
        raise gr.Error(f"Unsupported language: {language}")

    # Tokenize and synthesize
    inputs = tokenizer(text=text, return_tensors="pt").to(device)
    with torch.no_grad():
        outputs = model(inputs["input_ids"], speaker_id=speaker_id, emotion_id=DEFAULT_STYLE_ID)

    waveform = outputs.waveform.squeeze().cpu().numpy()
    sample_rate = model.config.sampling_rate

    # gr.Audio accepts a (sample_rate, numpy waveform) tuple directly,
    # so writing a temporary WAV file is unnecessary.
    return sample_rate, waveform


# Gradio interface
iface = gr.Interface(
    fn=generate_audio,
    inputs=[
        gr.Textbox(label="Enter Text"),
        gr.Dropdown(["mar", "hin", "san"], label="Select Language"),
    ],
    outputs=gr.Audio(label="Generated Audio"),
    title="VITS TTS for Indian Languages (Marathi, Hindi, Sanskrit)",
    description="Uses ai4bharat/vits_rasa_13. Enter text and select a language.",
)

iface.launch()
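
# --- Optional: programmatic use (a minimal sketch, not part of the app above) ---
# generate_audio can also be called directly to synthesize a clip and save it to
# disk, e.g. for batch jobs. This assumes the soundfile package is installed; the
# text, language code, and output filename below are placeholders.
#
#   import soundfile as sf
#   sample_rate, waveform = generate_audio("नमस्कार", "mar")
#   sf.write("output.wav", waveform, sample_rate)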