import torch
from transformers import pipeline
from PIL import Image
from scipy.io import wavfile
import gradio as gr
import numpy as np

# Run the pipelines on the GPU when available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Image-captioning pipeline (BLIP) to describe the uploaded image.
caption_image = pipeline("image-to-text", model="Salesforce/blip-image-captioning-large", device=device)

# Text-to-speech pipeline (VITS) to narrate the generated caption.
narrator = pipeline("text-to-speech", model="kakao-enterprise/vits-ljs", device=device)

def process_image(image):
    # Generate a caption for the uploaded image.
    caption = caption_image(image)[0]['generated_text']

    # Convert the caption to speech.
    speech = narrator(caption)

    # Scale the float waveform to 16-bit PCM samples for the WAV file.
    audio_data = np.array(speech["audio"][0] * 32767, dtype=np.int16)

    # Write the audio to disk so Gradio can serve it as a file.
    audio_path = "caption.wav"
    wavfile.write(audio_path, rate=speech["sampling_rate"], data=audio_data)

    return caption, audio_path

# Gradio interface: image in, caption text and narrated audio out.
iface = gr.Interface(
    fn=process_image,
    inputs=gr.Image(type="pil"),
    outputs=[
        gr.Textbox(label="Generated Caption"),
        gr.Audio(label="Generated Audio", type="filepath"),
    ],
)

iface.launch()