# tts/app.py
import torch
from transformers import AutoModel, AutoTokenizer
import gradio as gr
import soundfile as sf
import numpy as np
import tempfile
# Load model and tokenizer
device = "cuda" if torch.cuda.is_available() else "cpu"
model = AutoModel.from_pretrained("ai4bharat/vits_rasa_13", trust_remote_code=True).to(device)
tokenizer = AutoTokenizer.from_pretrained("ai4bharat/vits_rasa_13", trust_remote_code=True)
# Mapping: language -> speaker_id
LANG_SPEAKER_MAP = {
"asm": 0, "ben": 2, "brx": 4, "doi": 6,
"kan": 8, "mai": 10, "mal": 11,
"mar": 13, "nep": 14, "pan": 16,
"san": 17, "tam": 18, "tel": 19,
"hin": 13 # use Marathi Male voice for Hindi (close)
}
# Fixed default style/emotion id used for every request
DEFAULT_STYLE_ID = 0  # ALEXA
def tts_from_json(json_input):
    text = json_input.get("text")
    lang = (json_input.get("language") or "").lower()
    speaker_id = LANG_SPEAKER_MAP.get(lang)
    if not text:
        raise gr.Error("Input JSON must contain a 'text' field.")
    if speaker_id is None:
        raise gr.Error(f"Language '{lang}' not supported.")
    try:
        inputs = tokenizer(text=text, return_tensors="pt").to(device)
        with torch.no_grad():
            outputs = model(inputs["input_ids"], speaker_id=speaker_id, emotion_id=DEFAULT_STYLE_ID)
        waveform = outputs.waveform.squeeze().cpu().numpy()
        sample_rate = model.config.sampling_rate
        # Save to a temp file and return its path for Gradio playback
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
            sf.write(f.name, waveform, sample_rate)
            return f.name
    except Exception as e:
        # Raise gr.Error so failures show up in the UI instead of breaking the Audio output
        raise gr.Error(f"TTS failed: {e}")
iface = gr.Interface(
fn=tts_from_json,
inputs=gr.JSON(label="Input JSON: {'text': '...', 'language': 'mar/hin/san'}"),
outputs=gr.Audio(label="Generated Audio"),
title="VITS TTS for Indian Languages (Marathi, Hindi, Sanskrit)",
description="Uses ai4bharat/vits_rasa_13. Supports Marathi, Hindi, and Sanskrit."
)
iface.launch()
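# A minimal sketch of calling this Space remotely with gradio_client. The Space
# id "kunalpro379/tts" is an assumption inferred from the repo path, and
# "/predict" is Gradio's default endpoint name; adjust both as needed:
#
#   from gradio_client import Client
#   client = Client("kunalpro379/tts")
#   audio_path = client.predict({"text": "राम राम", "language": "san"}, api_name="/predict")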