Spaces:

Nepjune
/

Project_image_captioning_blip

Runtime error

File size: 1,382 Bytes

8fe5718
fe24d04
79954be
1961266
ff7ab28
6b4a9a6
fe24d04
 
 
6d97bc1
fe24d04
97599f2
5fe2fff
 
 
 
 
97599f2
ff7ab28
 
 
5fe2fff
 
 
 
1961266
5fe2fff
195e4ea
5fe2fff
 
 
 
 
3c7b357
ff7ab28
 
5fe2fff
97599f2

import gradio as gr
from transformers import BlipProcessor, BlipForConditionalGeneration
from gtts import gTTS
from playsound import playsound
from concurrent.futures import ThreadPoolExecutor

# Checkpoint identifier handed to both from_pretrained() calls below.
model_id = "dblasko/blip-dalle3-img2prompt"
# Model and processor are loaded once at import time and shared as module
# globals by generate_caption().
model = BlipForConditionalGeneration.from_pretrained(model_id)
processor = BlipProcessor.from_pretrained(model_id)

def generate_caption(image):
    """Caption an image with the BLIP model and synthesize the caption as speech.

    Args:
        image: The image as delivered by the Gradio image input, or ``None``
            when the input has been cleared.

    Returns:
        A ``(caption, audio_path)`` tuple. ``audio_path`` is the path of the
        MP3 file holding the spoken caption, or ``None`` when there is no
        image or the generated caption is empty.
    """
    # A live interface also fires when the image is cleared; there is nothing
    # to caption in that case.
    if image is None:
        return "", None

    inputs = processor(images=image, return_tensors="pt")
    # temperature / top_k / top_p are sampling options of generate(); passing
    # them to batch_decode() has no effect. They only apply when sampling is
    # enabled, hence do_sample=True.
    generated_ids = model.generate(
        pixel_values=inputs.pixel_values,
        max_length=50,
        do_sample=True,
        temperature=0.8,
        top_k=40,
        top_p=0.9,
    )
    generated_caption = processor.batch_decode(
        generated_ids, skip_special_tokens=True
    )[0]

    # gTTS refuses empty text, so skip speech synthesis for a blank caption.
    if not generated_caption.strip():
        return generated_caption, None

    # Convert the generated caption to speech.
    tts = gTTS(text=generated_caption, lang='en')
    audio_path = "generated_audio.mp3"
    tts.save(audio_path)

    return generated_caption, audio_path

def play_audio(audio_path):
    """Play the audio file at ``audio_path`` by handing it to ``playsound``.

    NOTE(review): playback presumably happens on the machine running this
    process, not in the visitor's browser -- confirm this is intended.
    """
    playsound(audio_path)

# Create a Gradio interface with an image input, a textbox for the caption,
# and an audio player for the spoken caption.
demo = gr.Interface(
    fn=generate_caption,
    inputs=gr.Image(),
    outputs=[
        gr.Textbox(label="Generated caption"),
        # generate_caption returns (caption, audio_path); the path has to be
        # routed to an audio output. gr.Button is not an output component and
        # does not accept a callback as a second positional argument.
        gr.Audio(label="Generated audio", type="filepath"),
    ],
    live=True,  # rerun automatically whenever the input image changes
)
demo.launch(share=True)