# someshb07's picture
# Create app.py
# 7a0d264 verified
# raw
# history blame
# 1.52 kB
import torch
import numpy as np
import scipy.io.wavfile as wavfile

import gradio as gr
from PIL import Image
# Use a pipeline as a high-level helper (duplicate import of `pipeline` removed).
from transformers import pipeline

# Image-captioning pipeline: produces a text description of an uploaded image.
pipe = pipeline("image-to-text", model="Salesforce/blip-image-captioning-large")

# Text-to-speech pipeline: narrates the generated caption as raw audio samples.
narrator = pipeline("text-to-speech", model="kakao-enterprise/vits-ljs")
def generate_audio(text):
    """Synthesize *text* to speech and write it to 'output.wav'.

    Returns the path of the written WAV file, which Gradio's Audio
    component can play back directly.
    """
    speech = narrator(text)
    # The TTS pipeline returns a dict with 'audio' (2-D array, first row is
    # the waveform) and 'sampling_rate'.
    wavfile.write("output.wav",
                  rate=speech['sampling_rate'],
                  data=speech['audio'][0])
    return 'output.wav'
def caption_my_image(imagee):
    """Caption the given image and return a WAV narration of that caption.

    Accepts either a NumPy array (as delivered by gr.Image) or a PIL
    image; anything else is rejected with a TypeError.
    """
    # Normalize the input to a PIL image before captioning.
    if isinstance(imagee, np.ndarray):
        imagee = Image.fromarray(imagee)
    elif not isinstance(imagee, Image.Image):
        raise TypeError("Unsupported image format. Please upload a valid image.")

    rgb_image = imagee.convert('RGB')
    captions = pipe(rgb_image)
    return generate_audio(captions[0]['generated_text'])
# Wire the captioner into a simple Gradio UI: image in, narrated audio out.
demo = gr.Interface(fn=caption_my_image,
                    inputs=[gr.Image(label='Upload an image to know the story behind it')],
                    outputs=[gr.Audio(label='Play the narration of an image')],
                    title="Here Image narration in real time",
                    description='This will narrate the description of the image'
                    )
# share expects a bool; the original passed the string 'True', which only
# worked by accident because any non-empty string is truthy.
demo.launch(share=True, debug=True)