import torch
from transformers import AutoModel, AutoTokenizer
import gradio as gr
import soundfile as sf
import numpy as np
import tempfile
# Load model and tokenizer
device = "cpu" # Change to "cuda" if you have GPU
model = AutoModel.from_pretrained("ai4bharat/vits_rasa_13", trust_remote_code=True).to(device)
tokenizer = AutoTokenizer.from_pretrained("ai4bharat/vits_rasa_13", trust_remote_code=True)
# Speaker IDs for languages
LANG_SPEAKER_MAP = {
    "mar": 13,  # Marathi Male
    "hin": 13,  # Reuse the Marathi Male speaker for Hindi
    "san": 17   # Sanskrit Male
}
DEFAULT_STYLE_ID = 0 # ALEXA
def generate_audio(text, language):
    # Reject invalid input before running the model
    if not text.strip():
        raise gr.Error("Text cannot be empty.")
    speaker_id = LANG_SPEAKER_MAP.get(language.lower())
    if speaker_id is None:
        raise gr.Error(f"Unsupported language: {language}")

    # Tokenize the text and synthesize speech
    inputs = tokenizer(text=text, return_tensors="pt").to(device)
    with torch.no_grad():
        outputs = model(inputs['input_ids'], speaker_id=speaker_id, emotion_id=DEFAULT_STYLE_ID)
    waveform = outputs.waveform.squeeze().cpu().numpy()
    sample_rate = model.config.sampling_rate

    # Save a temporary WAV copy; Gradio only needs the (sample_rate, waveform) tuple returned below
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
        sf.write(f.name, waveform, sample_rate)

    return sample_rate, waveform
# Gradio interface: text box and language dropdown in, audio out
iface = gr.Interface(
    fn=generate_audio,
    inputs=[
        gr.Textbox(label="Enter Text"),
        gr.Dropdown(["mar", "hin", "san"], label="Select Language")
    ],
    outputs=gr.Audio(label="Generated Audio"),
    title="VITS TTS for Indian Languages (Marathi, Hindi, Sanskrit)",
    description="Uses ai4bharat/vits_rasa_13. Enter text and select a language."
)
iface.launch()
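
# Usage sketch: generate_audio can also be called directly, without the web UI,
# to render a clip to disk. The sample text and output filename below are
# placeholders, not part of the Space itself.
#
#   sr, audio = generate_audio("नमस्कार", "mar")
#   sf.write("sample_mar.wav", audio, sr)
#
# Note: iface.launch() above blocks, so place direct calls like this before it.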