import torch
from transformers import pipeline
from PIL import Image
from scipy.io import wavfile
import gradio as gr
import numpy as np

# Select the device (GPU if available, otherwise CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the image-to-text pipeline (BLIP large captioning model)
caption_image = pipeline("image-to-text", model="Salesforce/blip-image-captioning-large", device=device)

# Alternative image-to-text pipeline using the vit-gpt2 model
#caption_pipeline = pipeline("image-to-text", model="nlpconnect/vit-gpt2-image-captioning", device=device)

# Load the text-to-speech pipeline (VITS trained on LJ Speech)
narrator = pipeline("text-to-speech", model="kakao-enterprise/vits-ljs", device=device)

# Local image paths used as Gradio examples
example_images = ["image1.jpeg", "image2.jpeg", "image3.jpeg"]

def process_image(image):
    # Generate a caption for the input image
    caption = caption_image(image)[0]["generated_text"]

    # Synthesize speech from the caption; the pipeline returns a float
    # waveform under "audio" and its sample rate under "sampling_rate"
    speech = narrator(caption)

    # Scale the float waveform to 16-bit PCM
    audio_data = np.array(speech["audio"][0] * 32767, dtype=np.int16)

    # Save the audio to a WAV file
    audio_path = "caption.wav"
    wavfile.write(audio_path, rate=speech["sampling_rate"], data=audio_data)

    return caption, audio_path

# Create the Gradio interface
iface = gr.Interface(
    fn=process_image,
    inputs=gr.Image(type="pil"),
    outputs=[
        gr.Textbox(label="Generated Caption"),
        gr.Audio(label="Generated Audio", type="filepath"),
    ],
    examples=example_images,
)

# Launch the interface
iface.launch()
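
# Optional: a quick headless check of the same pipeline, as a minimal sketch.
# It assumes an "image1.jpeg" file exists next to this script (the filename is
# just an example, not something the models require). Uncomment and run in
# place of iface.launch() above to verify captioning and synthesis without the UI:
#
# test_caption, test_audio = process_image(Image.open("image1.jpeg"))
# print(f"Caption: {test_caption}")
# print(f"Audio written to: {test_audio}")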