# NOTE: Hugging Face Spaces page residue (runtime status, file size, commit
# hashes, line-number gutter) removed — it was not part of the source file.
import torch
from transformers import AutoModel, AutoTokenizer
import gradio as gr
import soundfile as sf
import numpy as np
import tempfile
# Load model and tokenizer
# trust_remote_code=True is required: vits_rasa_13 ships a custom model class.
device = "cpu" # or "cuda" if available
model = AutoModel.from_pretrained("ai4bharat/vits_rasa_13", trust_remote_code=True).to(device)
tokenizer = AutoTokenizer.from_pretrained("ai4bharat/vits_rasa_13", trust_remote_code=True)
# Speaker-id lookup keyed by language code.
# Hindi ("hin") reuses speaker 13 (the Marathi voice) as the closest match.
LANG_SPEAKER_MAP = {
    "asm": 0,
    "ben": 2,
    "brx": 4,
    "doi": 6,
    "kan": 8,
    "mai": 10,
    "mal": 11,
    "mar": 13,
    "nep": 14,
    "pan": 16,
    "san": 17,
    "tam": 18,
    "tel": 19,
    "hin": 13,
}

# Fixed speaking style applied to every request (0 == ALEXA).
DEFAULT_STYLE_ID = 0
def tts_from_json(json_input):
    """Synthesize speech from a JSON request.

    Args:
        json_input: dict with keys "text" (string to speak) and "language"
            (code present in LANG_SPEAKER_MAP, case-insensitive).

    Returns:
        (sample_rate, waveform) tuple suitable for a gr.Audio output on
        success, or an error-message string on failure.
    """
    try:
        text = json_input["text"]
        lang = json_input["language"].lower()
        speaker_id = LANG_SPEAKER_MAP.get(lang)
        if speaker_id is None:
            return f"Language '{lang}' not supported."
        inputs = tokenizer(text=text, return_tensors="pt").to(device)
        # Inference only — disable autograd to avoid building a graph.
        with torch.no_grad():
            outputs = model(inputs['input_ids'], speaker_id=speaker_id, emotion_id=DEFAULT_STYLE_ID)
        waveform = outputs.waveform.squeeze().cpu().numpy()
        sample_rate = model.config.sampling_rate
        # Fixed defect: the original wrote the waveform to a delete=False temp
        # .wav file that was never returned or read, leaking one file per call.
        # Gradio plays the (sample_rate, waveform) tuple directly.
        return sample_rate, waveform
    except Exception as e:
        # Broad catch is deliberate: this is the UI boundary, and Gradio shows
        # the returned string to the user instead of crashing the app.
        return f"Error: {str(e)}"
# Assemble the Gradio UI: a JSON request in, synthesized audio out.
request_box = gr.JSON(label="Input JSON: {'text': '...', 'language': 'mar/hin/san'}")
audio_out = gr.Audio(label="Generated Audio")
iface = gr.Interface(
    fn=tts_from_json,
    inputs=request_box,
    outputs=audio_out,
    title="VITS TTS for Indian Languages (Marathi, Hindi, Sanskrit)",
    description="Uses ai4bharat/vits_rasa_13. Supports Marathi, Hindi, and Sanskrit.",
)
iface.launch()
# (trailing page-gutter character removed)