# Image-to-speech demo: caption an uploaded image with BLIP, then read the
# caption aloud with a VITS text-to-speech model, all behind a Gradio UI.
import torch
from transformers import pipeline
from PIL import Image
from scipy.io import wavfile
import gradio as gr
import numpy as np
import os
import requests

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
					
					
						
caption_image = pipeline("image-to-text", model="Salesforce/blip-image-captioning-large", device=device)
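
# Optional sanity check for the captioner on its own (a sketch, assuming a
# local "image1.jpeg" exists): the pipeline takes a PIL image and returns a
# list of dicts carrying a "generated_text" key. Uncomment to try it:
# print(caption_image(Image.open("image1.jpeg"))[0]["generated_text"])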
					
					
						
narrator = pipeline("text-to-speech", model="kakao-enterprise/vits-ljs", device=device)
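
# Optional sanity check for the narrator (a sketch): the pipeline returns a
# dict with a float "audio" waveform and an integer "sampling_rate". The
# exact array shape can vary across transformers versions, which is why
# process_image() below indexes [0]. Uncomment to inspect:
# demo = narrator("A short test sentence.")
# print(np.shape(demo["audio"]), demo["sampling_rate"])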
					
					
						
# Example inputs surfaced in the UI; these image files are expected to live
# alongside this script.
example_images = ["image1.jpeg", "image2.jpeg", "image3.jpeg"]
					
					
						
def process_image(image):
    # Caption the uploaded image.
    caption = caption_image(image)[0]["generated_text"]

    # Synthesize speech from the caption.
    speech = narrator(caption)

    # Scale the float waveform (roughly in [-1, 1]) to 16-bit PCM samples.
    audio_data = (speech["audio"][0] * 32767).astype(np.int16)

    # Write the audio to disk so Gradio can serve it back as a file.
    audio_path = "caption.wav"
    wavfile.write(audio_path, rate=speech["sampling_rate"], data=audio_data)

    return caption, audio_path
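
# Optional end-to-end test without the UI (a sketch: the URL is a placeholder,
# not from the original source, and it additionally needs io from the
# standard library). Uncomment to try it:
# import io
# resp = requests.get("https://example.com/sample.jpg", timeout=10)
# test_caption, test_wav = process_image(Image.open(io.BytesIO(resp.content)))
# print(test_caption, test_wav)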
					
					
						
iface = gr.Interface(
    fn=process_image,
    inputs=gr.Image(type="pil"),
    outputs=[gr.Textbox(label="Generated Caption"), gr.Audio(label="Generated Audio", type="filepath")],
    examples=example_images,
)
iface.launch()
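
# launch() serves the app locally by default; Gradio can also create a
# temporary public link if you pass share=True:
# iface.launch(share=True)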