|
import torch |
|
from transformers import pipeline |
|
from PIL import Image |
|
from scipy.io import wavfile |
|
import gradio as gr |
|
import numpy as np |
|
import os |
|
import requests |
|
|
|
# Run the models on GPU when one is available, otherwise fall back to CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


# Image-captioning pipeline (BLIP large): takes a PIL image and returns a
# list of dicts, e.g. [{"generated_text": "..."}].  Model weights are
# downloaded on first run.
caption_image = pipeline("image-to-text", model="Salesforce/blip-image-captioning-large", device=device)


# Text-to-speech pipeline (VITS trained on LJSpeech): takes a string and
# returns a dict with "audio" (float waveform) and "sampling_rate".
narrator = pipeline("text-to-speech", model="kakao-enterprise/vits-ljs", device=device)


# Example images hosted on GitHub; "?raw=true" makes GitHub serve the raw
# image bytes rather than the HTML preview page.
image_urls = [
    "https://github.com/Walid-Ahmed/ML_Datasets/blob/master/image1.jpeg?raw=true",
    "https://github.com/Walid-Ahmed/ML_Datasets/blob/master/image2.jpeg?raw=true",
    "https://github.com/Walid-Ahmed/ML_Datasets/blob/master/image3.jpeg?raw=true"
]


# Local cache directory for the example images shown in the Gradio UI.
save_dir = "example_images"
os.makedirs(save_dir, exist_ok=True)
|
|
|
|
|
def download_image(url, filename, timeout=30):
    """Download *url* to the local path *filename*.

    Args:
        url: HTTP(S) URL to fetch.
        filename: Local path the response body is written to (binary mode).
        timeout: Seconds to wait for the server before giving up.  A
            `requests.get` call without a timeout can hang indefinitely.

    Returns:
        The filename on success, or None if the download failed.
    """
    # Network errors (DNS failure, refused connection, timeout) raise rather
    # than set status_code; catch them so one bad URL does not crash the
    # whole script at startup.
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        print(f"Failed to download: {url} ({exc})")
        return None

    if response.status_code == 200:
        with open(filename, "wb") as f:
            f.write(response.content)
        return filename
    else:
        print(f"Failed to download: {url}")
        return None
|
|
|
|
|
# Make sure every example image exists locally, fetching any that are missing.
example_images = []
for position, url in enumerate(image_urls, start=1):
    local_path = os.path.join(save_dir, f"image{position}.jpeg")
    if not os.path.exists(local_path):
        download_image(url, local_path)
    example_images.append(local_path)
|
|
|
def process_image(image):
    """Caption *image* and narrate the caption as a WAV file.

    Args:
        image: Input image (a PIL image, as supplied by the Gradio input).

    Returns:
        Tuple of (caption string, path to the generated 16-bit WAV file).
    """
    # BLIP returns a list of candidate dicts; use the first one's text.
    caption = caption_image(image)[0]['generated_text']

    # VITS returns a dict with a float waveform under "audio" and its
    # "sampling_rate".  [0] takes the first row — presumably a batch/channel
    # axis of size 1; TODO confirm against the pipeline's output shape.
    speech = narrator(caption)
    waveform = speech["audio"][0]

    # Clip before scaling: float samples can reach or slightly exceed |1.0|,
    # and a raw int16 cast of 1.0 * 32767+ would wrap around (integer
    # overflow), producing loud clicks in the output audio.
    audio_data = (np.clip(waveform, -1.0, 1.0) * 32767).astype(np.int16)

    # Written to the working directory; overwritten on every call.
    audio_path = "caption.wav"
    wavfile.write(audio_path, rate=speech["sampling_rate"], data=audio_data)

    return caption, audio_path
|
|
|
|
|
# Build the Gradio UI: one image input; caption text and narrated audio out.
caption_box = gr.Textbox(label="Generated Caption")
audio_player = gr.Audio(label="Generated Audio", type="filepath")

iface = gr.Interface(
    fn=process_image,
    inputs=gr.Image(type="pil"),
    outputs=[caption_box, audio_player],
    examples=example_images,
)

# Start the local web server and block until it is stopped.
iface.launch()
|
|