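"""Gradio app: caption an uploaded image with BLIP, then narrate the
caption as speech with the VITS text-to-speech model."""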
import torch
from transformers import pipeline
from PIL import Image
from scipy.io import wavfile
import gradio as gr
import numpy as np
# Run on the GPU if one is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Load the image-to-text (captioning) pipeline
caption_image = pipeline("image-to-text", model="Salesforce/blip-image-captioning-large", device=device)
# Alternative captioning model (swap in place of the BLIP pipeline above):
# caption_image = pipeline("image-to-text", model="nlpconnect/vit-gpt2-image-captioning", device=device)

# Load the text-to-speech pipeline
narrator = pipeline("text-to-speech", model="kakao-enterprise/vits-ljs", device=device)
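# Note: both models are downloaded from the Hugging Face Hub and cached on first run.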
# Local example images shipped alongside the app
example_images = ["image1.jpeg", "image2.jpeg", "image3.jpeg"]

def process_image(image: Image.Image):
    """Caption the image, then synthesize the caption as speech."""
    # Generate the caption
    caption = caption_image(image)[0]["generated_text"]

    # Generate speech from the caption
    speech = narrator(caption)

    # VITS returns float32 audio in [-1, 1]; scale it to 16-bit PCM for the WAV file
    audio_data = np.array(speech["audio"][0] * 32767, dtype=np.int16)

    # Save the audio to a WAV file
    audio_path = "caption.wav"
    wavfile.write(audio_path, rate=speech["sampling_rate"], data=audio_data)

    return caption, audio_path
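
# Quick sanity check outside Gradio (assumes image1.jpeg from the examples
# list above is present next to app.py):
# caption, audio_path = process_image(Image.open("image1.jpeg"))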
# Create the Gradio interface
iface = gr.Interface(
    fn=process_image,
    inputs=gr.Image(type="pil"),
    outputs=[
        gr.Textbox(label="Generated Caption"),
        gr.Audio(label="Generated Audio", type="filepath"),
    ],
    examples=example_images,
)
# Launch the interface
iface.launch()
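# Optional: iface.launch(share=True) also creates a temporary public link
# when running locally.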