Nepjune committed on
Commit
6262d5a
·
verified ·
1 Parent(s): 584b8da

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +8 -10
app.py CHANGED
@@ -1,13 +1,15 @@
1
  import gradio as gr
2
  from transformers import BlipProcessor, BlipForConditionalGeneration
3
- from gtts import gTTS
4
- from playsound import playsound
5
  from concurrent.futures import ThreadPoolExecutor
 
6
 
7
  model_id = "dblasko/blip-dalle3-img2prompt"
8
  model = BlipForConditionalGeneration.from_pretrained(model_id)
9
  processor = BlipProcessor.from_pretrained(model_id)
10
 
 
 
 
11
  def generate_caption(image):
12
  # Generate caption from image
13
  inputs = processor(images=image, return_tensors="pt")
@@ -16,14 +18,10 @@ def generate_caption(image):
16
  generated_caption = processor.batch_decode(generated_ids, skip_special_tokens=True, temperature=0.8, top_k=40, top_p=0.9)[0]
17
 
18
  # Convert the generated caption to speech
19
- tts = gTTS(text=generated_caption, lang='en')
20
- audio_path = "generated_audio.mp3"
21
- tts.save(audio_path)
22
-
23
- return generated_caption, audio_path
24
 
25
- def play_audio(audio_path):
26
- playsound(audio_path)
27
 
28
  # Create a Gradio interface with an image input, a textbox output, a button, and an audio player
29
  demo = gr.Interface(
@@ -31,7 +29,7 @@ demo = gr.Interface(
31
  inputs=gr.Image(),
32
  outputs=[
33
  gr.Textbox(label="Generated caption"),
34
- gr.Button("Convert to Audio", play_audio),
35
  ],
36
  live=True # ทำให้ Gradio ทำงานแบบไม่บล็อก
37
  )
 
1
  import gradio as gr
2
  from transformers import BlipProcessor, BlipForConditionalGeneration
 
 
3
  from concurrent.futures import ThreadPoolExecutor
4
+ import pyttsx3
5
 
6
  model_id = "dblasko/blip-dalle3-img2prompt"
7
  model = BlipForConditionalGeneration.from_pretrained(model_id)
8
  processor = BlipProcessor.from_pretrained(model_id)
9
 
10
+ # Initialize Text-to-Speech engine
11
+ tts_engine = pyttsx3.init()
12
+
13
  def generate_caption(image):
14
  # Generate caption from image
15
  inputs = processor(images=image, return_tensors="pt")
 
18
  generated_caption = processor.batch_decode(generated_ids, skip_special_tokens=True, temperature=0.8, top_k=40, top_p=0.9)[0]
19
 
20
  # Convert the generated caption to speech
21
+ tts_engine.save_to_file(generated_caption, "generated_audio.mp3")
22
+ tts_engine.runAndWait()
 
 
 
23
 
24
+ return generated_caption, "generated_audio.mp3"
 
25
 
26
  # Create a Gradio interface with an image input, a textbox output, a button, and an audio player
27
  demo = gr.Interface(
 
29
  inputs=gr.Image(),
30
  outputs=[
31
  gr.Textbox(label="Generated caption"),
32
+ gr.Button("Convert to Audio", None),
33
  ],
34
  live=True # ทำให้ Gradio ทำงานแบบไม่บล็อก
35
  )