import torch
from transformers import AutoModel, AutoTokenizer
import gradio as gr

# Load model and tokenizer
device = "cpu"  # or "cuda" if available
model = AutoModel.from_pretrained("ai4bharat/vits_rasa_13", trust_remote_code=True).to(device)
tokenizer = AutoTokenizer.from_pretrained("ai4bharat/vits_rasa_13", trust_remote_code=True)

# Mapping: language code -> speaker_id
LANG_SPEAKER_MAP = {
    "asm": 0, "ben": 2, "brx": 4, "doi": 6, "kan": 8, "mai": 10,
    "mal": 11, "mar": 13, "nep": 14, "pan": 16, "san": 17, "tam": 18,
    "tel": 19,
    "hin": 13,  # no dedicated Hindi speaker; reuse the Marathi male voice (closest match)
}

# Style/emotion (fixed default)
DEFAULT_STYLE_ID = 0  # ALEXA


def tts_from_json(json_input):
    try:
        text = json_input["text"]
        lang = json_input["language"].lower()

        speaker_id = LANG_SPEAKER_MAP.get(lang)
        if speaker_id is None:
            raise gr.Error(f"Language '{lang}' not supported.")

        inputs = tokenizer(text=text, return_tensors="pt").to(device)
        with torch.no_grad():  # inference only, no gradients needed
            outputs = model(inputs["input_ids"], speaker_id=speaker_id, emotion_id=DEFAULT_STYLE_ID)

        waveform = outputs.waveform.squeeze().cpu().numpy()
        sample_rate = model.config.sampling_rate

        # gr.Audio accepts a (sample_rate, numpy_array) tuple directly; no temp file needed
        return sample_rate, waveform
    except gr.Error:
        raise
    except Exception as e:
        # Surface errors in the Gradio UI instead of returning a string to an Audio output
        raise gr.Error(f"Error: {e}")


iface = gr.Interface(
    fn=tts_from_json,
    inputs=gr.JSON(label="Input JSON: {'text': '...', 'language': 'mar/hin/san'}"),
    outputs=gr.Audio(label="Generated Audio"),
    title="VITS TTS for Indian Languages (Marathi, Hindi, Sanskrit)",
    description="Uses ai4bharat/vits_rasa_13. Supports Marathi, Hindi, and Sanskrit.",
)

iface.launch()
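
# --- Example usage (a sketch for local sanity checks; the sample text and file name
# below are illustrative assumptions, not part of the app) ---
# The handler can be exercised directly, without launching the Gradio UI:
#
#   sample_rate, waveform = tts_from_json({"text": "नमस्कार", "language": "mar"})
#
# On success it returns a (sample_rate, numpy_array) tuple; unsupported languages or
# model errors raise gr.Error. To save the result to disk, soundfile could be used:
#
#   import soundfile as sf
#   sf.write("sample.wav", waveform, sample_rate)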