Nepjune's picture
Update app.py
ff7ab28 verified
raw
history blame
1.38 kB
import gradio as gr
from transformers import BlipProcessor, BlipForConditionalGeneration
from gtts import gTTS
from playsound import playsound
from concurrent.futures import ThreadPoolExecutor
# Hugging Face Hub identifier of the BLIP checkpoint used for captioning.
model_id = "dblasko/blip-dalle3-img2prompt"
# Load the captioning model and its matching processor once at import time so
# every call to generate_caption reuses the same objects.
model = BlipForConditionalGeneration.from_pretrained(model_id)
processor = BlipProcessor.from_pretrained(model_id)
def generate_caption(image):
    """Caption an image with BLIP and synthesize the caption as speech.

    Args:
        image: The image supplied by the Gradio image input, or ``None``
            when no image is present (e.g. the input was cleared).

    Returns:
        A ``(caption, audio_path)`` tuple. ``audio_path`` is the path of an
        MP3 file containing the spoken caption, or ``None`` when there is
        nothing to speak (no image, or an empty caption).
    """
    import tempfile

    # A live interface may invoke the callback with no image; return empty
    # outputs instead of failing inside the processor.
    if image is None:
        return "", None

    # Generate caption from image
    inputs = processor(images=image, return_tensors="pt")
    # temperature / top_k / top_p are generation options, not decoding
    # options, so they are passed to generate(); do_sample=True is needed
    # for them to take effect.
    generated_ids = model.generate(
        pixel_values=inputs.pixel_values,
        max_length=50,
        do_sample=True,
        temperature=0.8,
        top_k=40,
        top_p=0.9,
    )
    generated_caption = processor.batch_decode(
        generated_ids, skip_special_tokens=True
    )[0]

    # gTTS refuses empty text, so skip speech synthesis for a blank caption.
    if not generated_caption.strip():
        return generated_caption, None

    # Convert the generated caption to speech. Each call writes to its own
    # temporary file so concurrent requests do not overwrite each other.
    # NOTE(review): these files are not deleted here; confirm whether
    # periodic cleanup is needed for long-running deployments.
    with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as audio_file:
        audio_path = audio_file.name
    tts = gTTS(text=generated_caption, lang="en")
    tts.save(audio_path)

    return generated_caption, audio_path
def play_audio(audio_path):
    """Play the audio file at ``audio_path`` by handing it to ``playsound``.

    NOTE(review): playsound presumably plays on the machine running this
    process rather than in the user's browser — confirm this is intended.
    """
    playsound(audio_path)
# Create a Gradio interface with an image input, a textbox showing the
# generated caption, and an audio player for the spoken caption.
demo = gr.Interface(
    fn=generate_caption,
    inputs=gr.Image(),
    outputs=[
        gr.Textbox(label="Generated caption"),
        # generate_caption returns (caption, audio_path); the second value
        # is a file path, so it is rendered by an audio component. A Button
        # is not an output component and does not take a callback argument.
        gr.Audio(label="Generated audio"),
    ],
    live=True,  # re-run automatically whenever the input image changes
)
demo.launch(share=True)