import torch
import gradio as gr
import numpy as np
import scipy.io.wavfile as wavfile
from PIL import Image

# Use pipelines as high-level helpers
from transformers import pipeline

# Image-captioning pipeline (BLIP large)
pipe = pipeline("image-to-text", model="Salesforce/blip-image-captioning-large")

# Text-to-speech pipeline (VITS trained on LJ Speech)
narrator = pipeline("text-to-speech", model="kakao-enterprise/vits-ljs")


def generate_audio(text):
    # Synthesize speech and write it to a WAV file that Gradio can serve
    narrated_text = narrator(text)
    wavfile.write("output.wav",
                  rate=narrated_text["sampling_rate"],
                  data=narrated_text["audio"][0])
    return "output.wav"


def caption_my_image(image):
    # Gradio delivers uploads as NumPy arrays; convert to a PIL Image if needed
    if isinstance(image, np.ndarray):
        image = Image.fromarray(image)
    elif not isinstance(image, Image.Image):
        raise TypeError("Unsupported image format. Please upload a valid image.")
    image = image.convert("RGB")

    # Caption the image, then narrate the caption
    caption = pipe(image)
    final_caption = caption[0]["generated_text"]
    return generate_audio(final_caption)


demo = gr.Interface(
    fn=caption_my_image,
    inputs=[gr.Image(label="Upload an image to hear the story behind it")],
    outputs=[gr.Audio(label="Play the narration of the image")],
    title="Image narration in real time",
    description="Generates a caption for the uploaded image and narrates it aloud.",
)

demo.launch(share=True, debug=True)