import torch
import gradio as gr
import numpy as np
import scipy.io.wavfile as wavfile
from PIL import Image

# Use pipelines as high-level helpers
from transformers import pipeline

# Image-captioning pipeline (BLIP large)
pipe = pipeline("image-to-text", model="Salesforce/blip-image-captioning-large")

# Text-to-speech pipeline (VITS trained on LJ Speech)
narrator = pipeline("text-to-speech", model="kakao-enterprise/vits-ljs")


def generate_audio(text):
    # Synthesize speech and write it to a WAV file that Gradio can serve
    narrated_text = narrator(text)
    wavfile.write("output.wav",
                  rate=narrated_text["sampling_rate"],
                  data=narrated_text["audio"][0])
    return "output.wav"


def caption_my_image(image):
    # Gradio delivers uploads as NumPy arrays; convert to a PIL Image if needed
    if isinstance(image, np.ndarray):
        image = Image.fromarray(image)
    elif not isinstance(image, Image.Image):
        raise TypeError("Unsupported image format. Please upload a valid image.")
    image = image.convert("RGB")

    # Caption the image, then narrate the caption
    caption = pipe(image)
    final_caption = caption[0]["generated_text"]
    return generate_audio(final_caption)


demo = gr.Interface(
    fn=caption_my_image,
    inputs=[gr.Image(label="Upload an image to hear the story behind it")],
    outputs=[gr.Audio(label="Play the narration of the image")],
    title="Image narration in real time",
    description="Generates a caption for the uploaded image and narrates it aloud.",
)

demo.launch(share=True, debug=True)