import torch
from transformers import AutoModel, AutoTokenizer
import gradio as gr

# Load model and tokenizer
device = "cpu"  # or "cuda" if available
model = AutoModel.from_pretrained("ai4bharat/vits_rasa_13", trust_remote_code=True).to(device)
tokenizer = AutoTokenizer.from_pretrained("ai4bharat/vits_rasa_13", trust_remote_code=True)

# Mapping: language code -> speaker_id
LANG_SPEAKER_MAP = {
    "asm": 0, "ben": 2, "brx": 4, "doi": 6, "kan": 8, "mai": 10,
    "mal": 11, "mar": 13, "nep": 14, "pan": 16, "san": 17, "tam": 18,
    "tel": 19,
    "hin": 13,  # no dedicated Hindi speaker; reuse the Marathi male voice (closest match)
}

# Style/emotion (fixed default)
DEFAULT_STYLE_ID = 0  # ALEXA


def tts_from_json(json_input):
    try:
        text = json_input["text"]
        lang = json_input["language"].lower()

        speaker_id = LANG_SPEAKER_MAP.get(lang)
        if speaker_id is None:
            raise gr.Error(f"Language '{lang}' not supported.")

        inputs = tokenizer(text=text, return_tensors="pt").to(device)
        with torch.no_grad():  # inference only, no gradients needed
            outputs = model(inputs["input_ids"], speaker_id=speaker_id, emotion_id=DEFAULT_STYLE_ID)

        waveform = outputs.waveform.squeeze().cpu().numpy()
        sample_rate = model.config.sampling_rate

        # gr.Audio accepts a (sample_rate, numpy_array) tuple directly; no temp file needed
        return sample_rate, waveform
    except gr.Error:
        raise
    except Exception as e:
        # Surface errors in the Gradio UI instead of returning a string to an Audio output
        raise gr.Error(f"Error: {e}")


iface = gr.Interface(
    fn=tts_from_json,
    inputs=gr.JSON(label="Input JSON: {'text': '...', 'language': 'mar/hin/san'}"),
    outputs=gr.Audio(label="Generated Audio"),
    title="VITS TTS for Indian Languages (Marathi, Hindi, Sanskrit)",
    description="Uses ai4bharat/vits_rasa_13. Supports Marathi, Hindi, and Sanskrit.",
)

iface.launch()
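
# --- Example usage (a sketch for local sanity checks; the sample text and file name
# below are illustrative assumptions, not part of the app) ---
# The handler can be exercised directly, without launching the Gradio UI:
#
#   sample_rate, waveform = tts_from_json({"text": "नमस्कार", "language": "mar"})
#
# On success it returns a (sample_rate, numpy_array) tuple; unsupported languages or
# model errors raise gr.Error. To save the result to disk, soundfile could be used:
#
#   import soundfile as sf
#   sf.write("sample.wav", waveform, sample_rate)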