import gradio as gr
import torch
import torchaudio
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
from datetime import timedelta
import os
import shutil
from pathlib import Path

# Load Silero VAD (used to find speech segments before transcription)
vad_model, utils = torch.hub.load(
    repo_or_dir='snakers4/silero-vad',
    model='silero_vad',
    trust_repo=True
)
(get_speech_ts, _, _, _, _) = utils

# Load the fine-tuned Wav2Vec2 CTC model and its processor
model_name = "ganga4364/Garchen_Rinpoche-wav2vec2-Checkpoint-19000"
model = Wav2Vec2ForCTC.from_pretrained(model_name)
processor = Wav2Vec2Processor.from_pretrained(model_name)
model.eval()

SAMPLE_RATE = 16000


def format_timestamp(seconds, format_type="srt"):
    """Convert seconds to an SRT or WebVTT timestamp string."""
    td = timedelta(seconds=seconds)
    total_seconds = int(td.total_seconds())
    hours = total_seconds // 3600
    minutes = (total_seconds % 3600) // 60
    secs = total_seconds % 60
    # Truncate rather than round so the milliseconds field never overflows to 1000
    milliseconds = td.microseconds // 1000
    if format_type == "srt":
        return f"{hours:02d}:{minutes:02d}:{secs:02d},{milliseconds:03d}"
    # WebVTT uses a dot instead of a comma before the milliseconds
    return f"{hours:02d}:{minutes:02d}:{secs:02d}.{milliseconds:03d}"


def create_subtitle_file(timestamps_with_text, output_path, format_type="srt"):
    """Write an SRT or WebVTT file from (start, end, text) tuples in sample offsets."""
    with open(output_path, 'w', encoding='utf-8') as f:
        if format_type == "vtt":
            f.write("WEBVTT\n\n")
        for i, (start_time, end_time, text) in enumerate(timestamps_with_text, 1):
            if format_type == "srt":
                f.write(f"{i}\n")  # SRT cues are numbered; WebVTT cues need not be
            start = format_timestamp(start_time / SAMPLE_RATE, format_type)
            end = format_timestamp(end_time / SAMPLE_RATE, format_type)
            f.write(f"{start} --> {end}\n{text}\n\n")


def create_preview_html(audio_path, vtt_path):
    """Copy the audio and VTT files into static/ and return player markup for gr.HTML."""
    static_dir = Path("static")
    static_dir.mkdir(exist_ok=True)

    # Copy the files into the static directory so Gradio can serve them
    audio_filename = Path(audio_path).name
    vtt_filename = Path(vtt_path).name
    shutil.copy2(audio_path, static_dir / audio_filename)
    shutil.copy2(vtt_path, static_dir / vtt_filename)

    # Fill in the template placeholders; the copied files are reachable through
    # Gradio's /file= route because static/ is passed to launch(allowed_paths=...)
    with open("templates/player.html", "r", encoding="utf-8") as f:
        html_content = f.read()
    html_content = html_content.replace("{{ audio_path }}", f"/file=static/{audio_filename}")
    html_content = html_content.replace("{{ vtt_path }}", f"/file=static/{vtt_filename}")

    # Keep a copy of the rendered page on disk as well
    preview_path = static_dir / "preview.html"
    with open(preview_path, "w", encoding="utf-8") as f:
        f.write(html_content)

    # Return the markup itself so the gr.HTML output renders the player
    # instead of printing a file path
    return html_content


def transcribe_with_vad(audio_path):
    # Load the audio and resample to 16 kHz mono, as both models expect
    wav, sr = torchaudio.load(audio_path)
    if sr != SAMPLE_RATE:
        wav = torchaudio.transforms.Resample(sr, SAMPLE_RATE)(wav)
    wav = wav.mean(dim=0)  # average channels down to mono (1-D tensor)

    # Detect speech segments; Silero returns sample offsets as {'start', 'end'} dicts
    speech_timestamps = get_speech_ts(wav, vad_model, sampling_rate=SAMPLE_RATE)
    if not speech_timestamps:
        return "No speech detected.", None, None, None

    timestamps_with_text = []
    transcriptions = []
    for ts in speech_timestamps:
        start, end = ts['start'], ts['end']
        segment = wav[start:end]

        # Transcribe the segment with greedy CTC decoding
        inputs = processor(segment.numpy(), sampling_rate=SAMPLE_RATE,
                           return_tensors="pt", padding=True)
        with torch.no_grad():
            logits = model(**inputs).logits
        predicted_ids = torch.argmax(logits, dim=-1)
        transcription = processor.decode(predicted_ids[0])

        transcriptions.append(transcription)
        timestamps_with_text.append((start, end, transcription))

    # Write the subtitle files next to the input audio
    base_path = os.path.splitext(audio_path)[0]
    srt_path = f"{base_path}.srt"
    vtt_path = f"{base_path}.vtt"
    create_subtitle_file(timestamps_with_text, srt_path, "srt")
    create_subtitle_file(timestamps_with_text, vtt_path, "vtt")

    # Build the HTML preview player
    preview_html = create_preview_html(audio_path, vtt_path)

    return " ".join(transcriptions), srt_path, vtt_path, preview_html


# Gradio interface
demo = gr.Interface(
    fn=transcribe_with_vad,
    inputs=gr.Audio(sources=["microphone", "upload"], type="filepath",
                    label="Upload or Record"),
    outputs=[
        gr.Textbox(label="Transcription"),
        gr.File(label="SRT Subtitle File"),
        gr.File(label="WebVTT Subtitle File"),
        gr.HTML(label="Preview Player"),
    ],
    title="Smart Speech-to-Text with VAD and Subtitles",
    description=(
        "Transcribe long audio using "
        "ganga4364/Garchen_Rinpoche-wav2vec2-Checkpoint-19000 and Silero VAD. "
        "Generates SRT and WebVTT subtitle files."
    ),
)

if __name__ == "__main__":
    # allowed_paths lets Gradio serve the copied files at /file=static/...
    demo.launch(share=True, allowed_paths=["static"])
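
# A minimal sketch of the templates/player.html file this app reads (an
# illustration of the assumed placeholders, not the template's actual
# contents). create_preview_html() substitutes {{ audio_path }} and
# {{ vtt_path }} before the page is served:
#
#   <!DOCTYPE html>
#   <html>
#     <body>
#       <video controls width="480">
#         <source src="{{ audio_path }}">
#         <track kind="subtitles" src="{{ vtt_path }}" default>
#       </video>
#     </body>
#   </html>
#
# The sketch uses a <video> element because browser support for <track>
# inside <audio> is inconsistent; <video> still plays audio-only files.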