Spaces:

Nepjune
/

Project_image_captioning_blip

Runtime error

File size: 1,173 Bytes

fe24d04
 
bf2de89
 
6b4a9a6
fe24d04
 
 
6d97bc1
fe24d04
bf2de89
 
b56d4a5
bf2de89
 
b56d4a5
bf2de89
 
 
b56d4a5
bf2de89
 
 
 
 
 
 
 
084403c

import gradio as gr
from transformers import BlipProcessor, BlipForConditionalGeneration
from gtts import gTTS
import IPython.display as ipd

# Checkpoint identifier handed to both `from_pretrained` calls below
# (presumably a Hugging Face Hub repo id -- confirm). The model and its
# processor are loaded once, at import time, and reused by every request.
model_id = "dblasko/blip-dalle3-img2prompt"
model = BlipForConditionalGeneration.from_pretrained(model_id)
processor = BlipProcessor.from_pretrained(model_id)

def generate_caption(image):
    """Caption an image with BLIP and synthesize the caption as speech.

    Args:
        image: The image delivered by the Gradio image input. It is passed
            unchanged to the BLIP processor.

    Returns:
        A ``(caption, audio_path)`` tuple. ``caption`` is the decoded text;
        ``audio_path`` is the path of an MP3 file containing the spoken
        caption, or ``None`` when the caption is empty and there is nothing
        to speak.

    Raises:
        gr.Error: If no image was supplied.
    """
    import tempfile

    if image is None:
        # Gradio passes None when the user submits without an image; surface a
        # readable message instead of an opaque processor failure.
        raise gr.Error("Please provide an image to caption.")

    inputs = processor(images=image, return_tensors="pt")
    pixel_values = inputs.pixel_values

    # Sampling parameters belong to generation, not decoding: passed to
    # `batch_decode` they have no effect on the produced tokens. They also
    # only take effect when `do_sample=True`.
    generated_ids = model.generate(
        pixel_values=pixel_values,
        max_length=50,
        do_sample=True,
        temperature=0.8,
        top_k=40,
        top_p=0.9,
    )
    generated_caption = processor.batch_decode(
        generated_ids, skip_special_tokens=True
    )[0]

    # gTTS refuses empty text, so skip speech synthesis in that case.
    if not generated_caption.strip():
        return generated_caption, None

    # Convert text to speech. Each call writes to its own temporary file so
    # that concurrent requests cannot overwrite one another's audio. The file
    # is intentionally not deleted here: the caller reads it after we return.
    tts = gTTS(text=generated_caption, lang='en')
    with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as audio_file:
        audio_path = audio_file.name
    tts.save(audio_path)

    return generated_caption, audio_path

def play_audio(audio_path):
    """Return an ``IPython.display.Audio`` object built from ``audio_path``.

    The Gradio interface defined in this file does not call this helper.
    """
    audio_widget = ipd.Audio(audio_path)
    return audio_widget

# Create a Gradio interface with an image input and two outputs that match the
# (caption, audio path) pair returned by generate_caption.
demo = gr.Interface(
    fn=generate_caption,
    inputs=gr.Image(),
    outputs=[
        gr.Textbox(label="Generated caption"),
        # `player` is not a documented gr.Audio parameter; Gradio versions
        # whose components reject unknown keywords fail at startup on it.
        # An output Audio component shows playback controls without it.
        gr.Audio(label="Play Audio"),
    ],
)
demo.launch()