# Image-to-speech demo: caption an uploaded image with BLIP, then read the
# caption aloud with a VITS text-to-speech model, all behind a Gradio UI.
import torch
from transformers import pipeline
from PIL import Image
from scipy.io import wavfile
import gradio as gr
import numpy as np
import os
import requests

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
					
					
						
caption_image = pipeline("image-to-text", model="Salesforce/blip-image-captioning-large", device=device)
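
# Optional sanity check for the captioner on its own (a sketch, assuming a
# local "image1.jpeg" exists): the pipeline takes a PIL image and returns a
# list of dicts carrying a "generated_text" key. Uncomment to try it:
# print(caption_image(Image.open("image1.jpeg"))[0]["generated_text"])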
					
					
						
narrator = pipeline("text-to-speech", model="kakao-enterprise/vits-ljs", device=device)
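
# Optional sanity check for the narrator (a sketch): the pipeline returns a
# dict with a float "audio" waveform and an integer "sampling_rate". The
# exact array shape can vary across transformers versions, which is why
# process_image() below indexes [0]. Uncomment to inspect:
# demo = narrator("A short test sentence.")
# print(np.shape(demo["audio"]), demo["sampling_rate"])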
					
					
						
# Example inputs surfaced in the UI; these image files are expected to live
# alongside this script.
example_images = ["image1.jpeg", "image2.jpeg", "image3.jpeg"]
					
					
						
def process_image(image):
    # Caption the uploaded image.
    caption = caption_image(image)[0]["generated_text"]

    # Synthesize speech from the caption.
    speech = narrator(caption)

    # Scale the float waveform (roughly in [-1, 1]) to 16-bit PCM samples.
    audio_data = (speech["audio"][0] * 32767).astype(np.int16)

    # Write the audio to disk so Gradio can serve it back as a file.
    audio_path = "caption.wav"
    wavfile.write(audio_path, rate=speech["sampling_rate"], data=audio_data)

    return caption, audio_path
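
# Optional end-to-end test without the UI (a sketch: the URL is a placeholder,
# not from the original source, and it additionally needs io from the
# standard library). Uncomment to try it:
# import io
# resp = requests.get("https://example.com/sample.jpg", timeout=10)
# test_caption, test_wav = process_image(Image.open(io.BytesIO(resp.content)))
# print(test_caption, test_wav)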
					
					
						
iface = gr.Interface(
    fn=process_image,
    inputs=gr.Image(type="pil"),
    outputs=[gr.Textbox(label="Generated Caption"), gr.Audio(label="Generated Audio", type="filepath")],
    examples=example_images,
)
iface.launch()
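
# launch() serves the app locally by default; Gradio can also create a
# temporary public link if you pass share=True:
# iface.launch(share=True)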