import gradio as gr
from transformers import WhisperProcessor, WhisperForConditionalGeneration
import torch
import librosa

# Hub identifier of the fine-tuned Whisper checkpoint served by this app
model_name = "hackergeek98/tinyyyy_whisper"

# Pick the compute device up front: CUDA when present, CPU otherwise
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Load the processor and the model from the same checkpoint,
# then place the model weights on the selected device
processor = WhisperProcessor.from_pretrained(model_name)
model = WhisperForConditionalGeneration.from_pretrained(model_name)
model.to(device)

# Define the ASR function
def transcribe_audio(audio_file):
    """Transcribe an audio file with the fine-tuned Whisper model.

    The file is decoded and resampled to 16 kHz with librosa, then
    transcribed in consecutive 30-second windows. Whisper's feature
    extractor pads/truncates every input to a single 30-second window, so
    feeding a longer recording in one call would silently drop everything
    after the first 30 seconds.

    Args:
        audio_file: Path to the audio file, as supplied by the Gradio
            ``Audio`` component. ``None`` or an empty path means the user
            submitted without providing audio.

    Returns:
        The transcription as a single string. Text from successive windows
        is stripped of surrounding whitespace and joined with single
        spaces. An empty string is returned when there is no audio.
    """
    # Nothing was uploaded/recorded: return an empty transcription instead
    # of letting librosa fail on a missing path.
    if not audio_file:
        return ""

    # Load audio file using librosa (supports multiple formats), resampled to 16 kHz
    audio_data, sampling_rate = librosa.load(audio_file, sr=16000)

    # Length of one model window in samples (Whisper operates on 30 s of audio).
    window_seconds = 30
    window_samples = window_seconds * sampling_rate

    pieces = []
    with torch.no_grad():
        # NOTE: windows are cut at fixed offsets, so a word that straddles a
        # boundary may be split between two windows.
        for start in range(0, len(audio_data), window_samples):
            window = audio_data[start:start + window_samples]

            # Preprocess the window into log-mel input features
            inputs = processor(
                window, sampling_rate=sampling_rate, return_tensors="pt"
            ).input_features.to(device)

            # Generate and decode the transcription for this window
            predicted_ids = model.generate(inputs)
            text = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0].strip()
            if text:
                pieces.append(text)

    return " ".join(pieces)

# Build the UI components first: an audio input handed to the callback as a
# file path, and a labelled text box that shows the result
audio_input = gr.Audio(type="filepath")
transcription_box = gr.Textbox(label="Transcription")

# Wire the components to the transcription function
interface = gr.Interface(
    fn=transcribe_audio,
    inputs=audio_input,
    outputs=transcription_box,
    title="Whisper ASR: Tinyyyy Model",
    description="Upload an audio file (e.g., .wav, .mp3, .ogg), and the fine-tuned Whisper model will transcribe it.",
)

# Start serving the app
interface.launch()