import torch
from transformers import pipeline
from PIL import Image
from scipy.io import wavfile
import gradio as gr
import numpy as np
import os
import requests

# Specify the device (CPU or GPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the image-to-text pipeline
caption_image = pipeline("image-to-text", model="Salesforce/blip-image-captioning-large", device=device)

# Alternative image-to-text pipeline with the vit-gpt2 model
# (kept under the same name so the rest of the script works if swapped in)
# caption_image = pipeline("image-to-text", model="nlpconnect/vit-gpt2-image-captioning", device=device)

# Load the text-to-speech pipeline
narrator = pipeline("text-to-speech", model="kakao-enterprise/vits-ljs", device=device)

# URLs of the example images
image_urls = [
    "https://github.com/Walid-Ahmed/ML_Datasets/blob/master/image1.jpeg?raw=true",
    "https://github.com/Walid-Ahmed/ML_Datasets/blob/master/image2.jpeg?raw=true",
    "https://github.com/Walid-Ahmed/ML_Datasets/blob/master/image3.jpeg?raw=true",
]

# Directory to save images
save_dir = "example_images"
os.makedirs(save_dir, exist_ok=True)

# Download a single image; return its path on success, None on failure
def download_image(url, filename):
    response = requests.get(url, timeout=30)
    if response.status_code == 200:
        with open(filename, "wb") as f:
            f.write(response.content)
        return filename
    else:
        print(f"Failed to download: {url}")
        return None

# Download the example images
example_images = []
for idx, url in enumerate(image_urls):
    img_path = os.path.join(save_dir, f"image{idx + 1}.jpeg")
    if not os.path.exists(img_path):  # Avoid redownloading if it already exists
        download_image(url, img_path)
    example_images.append(img_path)

def process_image(image):
    # Generate the caption
    caption = caption_image(image)[0]["generated_text"]

    # Generate speech from the caption
    speech = narrator(caption)

    # Convert the float waveform to 16-bit PCM
    audio_data = np.array(speech["audio"][0] * 32767, dtype=np.int16)

    # Save the audio to a WAV file
    audio_path = "caption.wav"
    wavfile.write(audio_path, rate=speech["sampling_rate"], data=audio_data)

    return caption, audio_path

# Create the Gradio interface
iface = gr.Interface(
    fn=process_image,
    inputs=gr.Image(type="pil"),
    outputs=[
        gr.Textbox(label="Generated Caption"),
        gr.Audio(label="Generated Audio", type="filepath"),
    ],
    examples=example_images,
)

# Launch the interface
iface.launch()
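
# ---------------------------------------------------------------------------
# Note on the 16-bit PCM conversion inside process_image: if the TTS model
# ever emits samples slightly outside [-1.0, 1.0], casting to np.int16 can
# wrap around and produce audible artifacts. A defensive variant (a sketch,
# not the original behavior) clamps the waveform before scaling:
#
# audio_data = (np.clip(speech["audio"][0], -1.0, 1.0) * 32767).astype(np.int16)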
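
# ---------------------------------------------------------------------------
# Optional: one-shot usage without the web UI. This is a minimal sketch, not
# part of the original app; it assumes the example downloads above succeeded
# and that iface.launch() is commented out (launch() blocks the script when
# run from the command line).
#
# img = Image.open(example_images[0])
# caption, audio_path = process_image(img)
# print(f"Caption: {caption}")
# print(f"Audio written to: {audio_path}")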