# Hugging Face Space — status: Running
import torch | |
from transformers import AutoModel, AutoTokenizer | |
import gradio as gr | |
import soundfile as sf | |
import numpy as np | |
import tempfile | |
# Load the model and tokenizer once at import time so every request reuses them.
MODEL_ID = "ai4bharat/vits_rasa_13"

# Run on the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# NOTE(security): trust_remote_code=True executes Python code shipped in the
# model repository. Only keep it enabled for repositories you trust.
model = AutoModel.from_pretrained(MODEL_ID, trust_remote_code=True).to(device)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
# Speaker id handed to the model for each supported language code.
# Keys are the lower-case codes accepted in the request's "language" field;
# the language names in the trailing comments follow ISO 639-3.
LANG_SPEAKER_MAP = dict(
    asm=0,   # Assamese
    ben=2,   # Bengali
    brx=4,   # Bodo
    doi=6,   # Dogri
    kan=8,   # Kannada
    mai=10,  # Maithili
    mal=11,  # Malayalam
    mar=13,  # Marathi
    nep=14,  # Nepali
    pan=16,  # Punjabi
    san=17,  # Sanskrit
    tam=18,  # Tamil
    tel=19,  # Telugu
    hin=13,  # Hindi: reuses the Marathi male voice as the closest match
)

# Fixed speaking style applied to every request (0 is the ALEXA style).
DEFAULT_STYLE_ID = 0
def tts_from_json(json_input):
    """Synthesize speech for a ``{"text": ..., "language": ...}`` request.

    Args:
        json_input: Mapping with a ``"text"`` entry (the sentence to speak)
            and a ``"language"`` entry. The language is matched
            case-insensitively against the keys of ``LANG_SPEAKER_MAP``.

    Returns:
        A ``(sample_rate, waveform)`` tuple. ``waveform`` is the model output
        squeezed and converted to a NumPy array on the CPU; ``sample_rate``
        is read from ``model.config.sampling_rate``.

    Raises:
        gr.Error: If the payload is malformed, the language is not in
            ``LANG_SPEAKER_MAP``, or synthesis fails. An audio output cannot
            display an error string, so failures are raised for Gradio to
            show in the UI instead of being returned.
    """
    if not isinstance(json_input, dict):
        raise gr.Error("Input must be a JSON object with 'text' and 'language' keys.")

    text = json_input.get("text")
    lang = json_input.get("language")
    if not isinstance(text, str) or not text.strip():
        raise gr.Error("Field 'text' must be a non-empty string.")
    if not isinstance(lang, str):
        raise gr.Error("Field 'language' must be a string.")

    lang = lang.strip().lower()
    speaker_id = LANG_SPEAKER_MAP.get(lang)
    if speaker_id is None:
        raise gr.Error(f"Language '{lang}' not supported.")

    try:
        inputs = tokenizer(text=text, return_tensors="pt").to(device)
        # Inference only: skip autograd bookkeeping so the result can be
        # converted to NumPy without detaching and no graph is retained.
        with torch.no_grad():
            outputs = model(
                inputs["input_ids"],
                speaker_id=speaker_id,
                emotion_id=DEFAULT_STYLE_ID,
            )
        waveform = outputs.waveform.squeeze().cpu().numpy()
    except Exception as e:
        # UI boundary: surface any synthesis failure as a visible message.
        raise gr.Error(f"Error: {e}") from e

    # The (rate, samples) tuple is returned directly, so no temporary WAV
    # file is written to disk.
    return model.config.sampling_rate, waveform
# Build the Gradio UI: one JSON request in, one generated audio clip out.
_request_input = gr.JSON(label="Input JSON: {'text': '...', 'language': 'mar/hin/san'}")
_audio_output = gr.Audio(label="Generated Audio")

iface = gr.Interface(
    fn=tts_from_json,
    inputs=_request_input,
    outputs=_audio_output,
    title="VITS TTS for Indian Languages (Marathi, Hindi, Sanskrit)",
    description="Uses ai4bharat/vits_rasa_13. Supports Marathi, Hindi, and Sanskrit.",
)

# Start serving the interface.
iface.launch()