import torch
from transformers import pipeline
from PIL import Image
from scipy.io import wavfile
import gradio as gr
import numpy as np
import os
import requests
# Specify the device (CPU or GPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
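# Note: transformers pipelines accept a torch.device, a device string, or an
# int index (0 = first GPU, -1 = CPU) for the `device` argument; behavior may
# vary slightly with the installed transformers version.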
# Load the image-to-text pipeline
caption_image = pipeline("image-to-text", model="Salesforce/blip-image-captioning-large", device=device)
# Alternative image-to-text model, kept for reference:
# caption_pipeline = pipeline("image-to-text", model="nlpconnect/vit-gpt2-image-captioning", device=device)
# Load the text-to-speech pipeline
narrator = pipeline("text-to-speech", model="kakao-enterprise/vits-ljs", device=device)
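# For reference (per the standard transformers pipeline APIs; worth verifying
# against the installed version):
#   caption_image(img) -> [{"generated_text": "..."}]
#   narrator(text)     -> {"audio": float ndarray of shape (1, n_samples),
#                          "sampling_rate": int}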
# URLs of the images
image_urls = [
    "https://github.com/Walid-Ahmed/ML_Datasets/blob/master/image1.jpeg?raw=true",
    "https://github.com/Walid-Ahmed/ML_Datasets/blob/master/image2.jpeg?raw=true",
    "https://github.com/Walid-Ahmed/ML_Datasets/blob/master/image3.jpeg?raw=true",
]
# Directory to save images
save_dir = "example_images"
os.makedirs(save_dir, exist_ok=True)
# Function to download images
def download_image(url, filename):
    try:
        # Timeout added so a stalled request cannot hang the app at startup
        response = requests.get(url, timeout=10)
        response.raise_for_status()
    except requests.RequestException as e:
        print(f"Failed to download {url}: {e}")
        return None
    with open(filename, "wb") as f:
        f.write(response.content)
    return filename
# Download images
example_images = []
for idx, url in enumerate(image_urls):
    img_path = os.path.join(save_dir, f"image{idx+1}.jpeg")
    if not os.path.exists(img_path):  # Avoid re-downloading if the file already exists
        download_image(url, img_path)
    if os.path.exists(img_path):  # Only list images that were actually downloaded
        example_images.append(img_path)
def process_image(image):
    # Generate the caption
    caption = caption_image(image)[0]['generated_text']
    # Generate speech from the caption
    speech = narrator(caption)
    # Scale the float waveform in [-1, 1] to 16-bit PCM range
    audio_data = np.array(speech["audio"][0] * 32767, dtype=np.int16)
    # Save the audio to a WAV file
    audio_path = "caption.wav"
    wavfile.write(audio_path, rate=speech["sampling_rate"], data=audio_data)
    return caption, audio_path
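# Optional smoke test (uncomment to run): caption and narrate the first example
# image directly, without the UI. Assumes at least one download succeeded.
# if example_images:
#     test_caption, test_audio = process_image(Image.open(example_images[0]))
#     print(test_caption, "->", test_audio)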
# Create Gradio interface
iface = gr.Interface(
    fn=process_image,
    inputs=gr.Image(type="pil"),
    outputs=[
        gr.Textbox(label="Generated Caption"),
        gr.Audio(label="Generated Audio", type="filepath"),
    ],
    examples=example_images,
)
# Launch the interface
iface.launch()
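# Note: on Hugging Face Spaces, launch() is sufficient; when running locally,
# iface.launch(share=True) would also create a temporary public link.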