import torch
from transformers import pipeline
from PIL import Image
from scipy.io import wavfile
import gradio as gr
import numpy as np

# Run the pipelines on the GPU when available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Image-captioning pipeline (BLIP) to describe the uploaded image.
caption_image = pipeline("image-to-text", model="Salesforce/blip-image-captioning-large", device=device)

# Text-to-speech pipeline (VITS) to narrate the generated caption.
narrator = pipeline("text-to-speech", model="kakao-enterprise/vits-ljs", device=device)

def process_image(image):
    # Generate a caption for the uploaded image.
    caption = caption_image(image)[0]['generated_text']

    # Convert the caption to speech.
    speech = narrator(caption)

    # Scale the float waveform to 16-bit PCM samples for the WAV file.
    audio_data = np.array(speech["audio"][0] * 32767, dtype=np.int16)

    # Write the audio to disk so Gradio can serve it as a file.
    audio_path = "caption.wav"
    wavfile.write(audio_path, rate=speech["sampling_rate"], data=audio_data)

    return caption, audio_path

# Gradio interface: image in, caption text and narrated audio out.
iface = gr.Interface(
    fn=process_image,
    inputs=gr.Image(type="pil"),
    outputs=[
        gr.Textbox(label="Generated Caption"),
        gr.Audio(label="Generated Audio", type="filepath"),
    ],
)

iface.launch()