import torch
from transformers import pipeline
from PIL import Image
from scipy.io import wavfile
import gradio as gr
import numpy as np
# Pick the device (GPU if available, otherwise CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
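# Note: transformers pipelines also accept an integer device index
# (0 for the first GPU, -1 for CPU), e.g.:
# device = 0 if torch.cuda.is_available() else -1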
# Load the image-to-text (captioning) pipeline
caption_image = pipeline("image-to-text", model="Salesforce/blip-image-captioning-large", device=device)

# Alternative captioning model (uncomment to swap it in):
# caption_image = pipeline("image-to-text", model="nlpconnect/vit-gpt2-image-captioning", device=device)
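# The image-to-text pipeline returns a list of dicts, e.g.
# [{"generated_text": "a dog sitting on a couch"}] (example text is illustrative);
# process_image() below reads the "generated_text" field of the first entry.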
# Load the text-to-speech pipeline
narrator = pipeline("text-to-speech", model="kakao-enterprise/vits-ljs", device=device)
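# The text-to-speech pipeline returns a dict holding a float waveform and its
# sample rate, roughly {"audio": np.ndarray of shape (1, num_samples),
# "sampling_rate": 22050}; the shape and rate shown are typical for VITS and
# should be treated as assumptions, not guarantees.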
# Local image paths used as clickable examples in the UI
example_images = ["image1.jpeg", "image2.jpeg", "image3.jpeg"]
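# These files are assumed to sit alongside this script (e.g. in the Space
# repository root); Gradio may fail at startup if a listed example is missing.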
def process_image(image):
    # Generate a caption for the uploaded image
    caption = caption_image(image)[0]["generated_text"]

    # Synthesize speech from the caption
    speech = narrator(caption)

    # Convert the float waveform (roughly in [-1, 1]) to 16-bit PCM,
    # clipping first so out-of-range samples don't wrap around on the cast
    waveform = np.clip(speech["audio"][0], -1.0, 1.0)
    audio_data = (waveform * 32767).astype(np.int16)

    # Save the audio to a WAV file that Gradio can serve back to the user
    audio_path = "caption.wav"
    wavfile.write(audio_path, rate=speech["sampling_rate"], data=audio_data)

    return caption, audio_path
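# Optional local smoke test, a minimal sketch that assumes one of the example
# images above exists on disk; left commented out so it never runs on a Space:
#
# if __name__ == "__main__":
#     caption, audio_path = process_image(Image.open("image1.jpeg"))
#     print(caption, "->", audio_path)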
# Build the Gradio interface
iface = gr.Interface(
    fn=process_image,
    inputs=gr.Image(type="pil"),
    outputs=[
        gr.Textbox(label="Generated Caption"),
        gr.Audio(label="Generated Audio", type="filepath"),
    ],
    examples=example_images,
)
# Launch the interface
iface.launch()
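# On Hugging Face Spaces, launch() with no arguments is sufficient; when
# running locally, iface.launch(share=True) can be used instead to get a
# temporary public URL.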