import torch
from transformers import pipeline
from PIL import Image
from scipy.io import wavfile
import gradio as gr
import numpy as np

# Select the device (GPU if available, otherwise CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the image-to-text (captioning) pipeline
caption_image = pipeline("image-to-text", model="Salesforce/blip-image-captioning-large", device=device)

# Alternative captioning model (vit-gpt2), kept for reference:
# caption_image = pipeline("image-to-text", model="nlpconnect/vit-gpt2-image-captioning", device=device)

# Load the text-to-speech pipeline
narrator = pipeline("text-to-speech", model="kakao-enterprise/vits-ljs", device=device)

def process_image(image):
    # Generate a caption for the input image
    caption = caption_image(image)[0]["generated_text"]

    # Generate speech from the caption
    speech = narrator(caption)

    # Convert the float waveform (values in [-1, 1]) to 16-bit PCM
    audio_data = np.array(speech["audio"][0] * 32767, dtype=np.int16)

    # Save the audio to a WAV file
    audio_path = "caption.wav"
    wavfile.write(audio_path, rate=speech["sampling_rate"], data=audio_data)

    return caption, audio_path

# Create the Gradio interface
iface = gr.Interface(
    fn=process_image,
    inputs=gr.Image(type="pil"),
    outputs=[gr.Textbox(label="Generated Caption"), gr.Audio(label="Generated Audio", type="filepath")],
)

# Launch the interface
iface.launch()
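
# Optional local sanity check without the web UI (sketch only; "example.jpg" is a
# hypothetical path, not part of the original app):
#   from PIL import Image
#   caption, wav_path = process_image(Image.open("example.jpg"))
#   print(caption, wav_path)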