|
import torch |
|
from transformers import pipeline |
|
from PIL import Image |
|
from scipy.io import wavfile |
|
import gradio as gr |
|
import numpy as np |
|
import os |
|
import requests |
|
|
|
# Run the models on GPU when one is available, otherwise fall back to CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


# Image-captioning pipeline (BLIP large): takes a PIL image and returns a
# list of dicts, e.g. [{"generated_text": "..."}].  Model weights are
# downloaded on first run.
caption_image = pipeline("image-to-text", model="Salesforce/blip-image-captioning-large", device=device)


# Text-to-speech pipeline (VITS trained on LJSpeech): takes a string and
# returns a dict with "audio" (float waveform) and "sampling_rate".
narrator = pipeline("text-to-speech", model="kakao-enterprise/vits-ljs", device=device)


# Example images hosted on GitHub; "?raw=true" makes GitHub serve the raw
# image bytes rather than the HTML preview page.
image_urls = [
    "https://github.com/Walid-Ahmed/ML_Datasets/blob/master/image1.jpeg?raw=true",
    "https://github.com/Walid-Ahmed/ML_Datasets/blob/master/image2.jpeg?raw=true",
    "https://github.com/Walid-Ahmed/ML_Datasets/blob/master/image3.jpeg?raw=true"
]


# Local cache directory for the example images shown in the Gradio UI.
save_dir = "example_images"
os.makedirs(save_dir, exist_ok=True)
|
|
|
|
|
def download_image(url, filename, timeout=30):
    """Download *url* to the local path *filename*.

    Args:
        url: HTTP(S) URL to fetch.
        filename: Local path the response body is written to (binary mode).
        timeout: Seconds to wait for the server before giving up.  A
            `requests.get` call without a timeout can hang indefinitely.

    Returns:
        The filename on success, or None if the download failed.
    """
    # Network errors (DNS failure, refused connection, timeout) raise rather
    # than set status_code; catch them so one bad URL does not crash the
    # whole script at startup.
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        print(f"Failed to download: {url} ({exc})")
        return None

    if response.status_code == 200:
        with open(filename, "wb") as f:
            f.write(response.content)
        return filename
    else:
        print(f"Failed to download: {url}")
        return None
|
|
|
|
|
# Make sure every example image exists locally, fetching any that are missing.
example_images = []
for position, url in enumerate(image_urls, start=1):
    local_path = os.path.join(save_dir, f"image{position}.jpeg")
    if not os.path.exists(local_path):
        download_image(url, local_path)
    example_images.append(local_path)
|
|
|
def process_image(image):
    """Caption *image* and narrate the caption as a WAV file.

    Args:
        image: Input image (a PIL image, as supplied by the Gradio input).

    Returns:
        Tuple of (caption string, path to the generated 16-bit WAV file).
    """
    # BLIP returns a list of candidate dicts; use the first one's text.
    caption = caption_image(image)[0]['generated_text']

    # VITS returns a dict with a float waveform under "audio" and its
    # "sampling_rate".  [0] takes the first row — presumably a batch/channel
    # axis of size 1; TODO confirm against the pipeline's output shape.
    speech = narrator(caption)
    waveform = speech["audio"][0]

    # Clip before scaling: float samples can reach or slightly exceed |1.0|,
    # and a raw int16 cast of 1.0 * 32767+ would wrap around (integer
    # overflow), producing loud clicks in the output audio.
    audio_data = (np.clip(waveform, -1.0, 1.0) * 32767).astype(np.int16)

    # Written to the working directory; overwritten on every call.
    audio_path = "caption.wav"
    wavfile.write(audio_path, rate=speech["sampling_rate"], data=audio_data)

    return caption, audio_path
|
|
|
|
|
# Build the Gradio UI: one image input; caption text and narrated audio out.
caption_box = gr.Textbox(label="Generated Caption")
audio_player = gr.Audio(label="Generated Audio", type="filepath")

iface = gr.Interface(
    fn=process_image,
    inputs=gr.Image(type="pil"),
    outputs=[caption_box, audio_player],
    examples=example_images,
)

# Start the local web server and block until it is stopped.
iface.launch()
|
|