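"""Gradio demo: caption an uploaded image and narrate the caption as audio.

A BLIP image-captioning pipeline (Salesforce/blip-image-captioning-large) generates
the caption, which is then synthesized to speech with the VITS text-to-speech model
(kakao-enterprise/vits-ljs) and returned as a WAV file.
"""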
import torch
from transformers import pipeline
from PIL import Image
from scipy.io import wavfile
import gradio as gr
import numpy as np
import os
import requests
# Specify the device (CPU or GPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the image-to-text pipeline
caption_image = pipeline("image-to-text", model="Salesforce/blip-image-captioning-large", device=device)
# Alternative captioning model (uncomment to use vit-gpt2 instead of BLIP)
# caption_image = pipeline("image-to-text", model="nlpconnect/vit-gpt2-image-captioning", device=device)

# Load the text-to-speech pipeline
narrator = pipeline("text-to-speech", model="kakao-enterprise/vits-ljs", device=device)

# Local example images displayed in the Gradio interface
example_images = ["image1.jpeg", "image2.jpeg", "image3.jpeg"]

def process_image(image):
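    """Caption the given PIL image and synthesize the caption as speech.

    Returns the caption text and the path to a WAV file containing the narration.
    """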
    # Generate the caption
    caption = caption_image(image)[0]['generated_text']

    # Generate speech from the caption
    speech = narrator(caption)

    # Scale the float waveform to 16-bit PCM so it can be written as a WAV file
    audio_data = np.array(speech["audio"][0] * 32767, dtype=np.int16)

    # Save the audio to a WAV file
    audio_path = "caption.wav"
    wavfile.write(audio_path, rate=speech["sampling_rate"], data=audio_data)

    return caption, audio_path

# Create Gradio interface
iface = gr.Interface(
    fn=process_image,
    inputs=gr.Image(type="pil"),
    outputs=[gr.Textbox(label="Generated Caption"), gr.Audio(label="Generated Audio", type="filepath")],
    examples=example_images
)

# Launch the interface
iface.launch()