Nepjune committed
Commit 97599f2 · verified · 1 Parent(s): c8345b4

Update app.py

Files changed (1)
  1. app.py +12 -6
app.py CHANGED
@@ -2,22 +2,28 @@ import gradio as gr
 from transformers import BlipProcessor, BlipForConditionalGeneration
 from gtts import gTTS
 from playsound import playsound
+from transformers import pipeline

+# Load the text-to-speech model
+tts_synthesizer = pipeline("text-to-speech", "suno/bark")
+
+# Load the image-to-text model
 model_id = "dblasko/blip-dalle3-img2prompt"
 model = BlipForConditionalGeneration.from_pretrained(model_id)
 processor = BlipProcessor.from_pretrained(model_id)

 def generate_caption(image):
+    # Generate caption from image
     inputs = processor(images=image, return_tensors="pt")
     pixel_values = inputs.pixel_values
-
     generated_ids = model.generate(pixel_values=pixel_values, max_length=50)
     generated_caption = processor.batch_decode(generated_ids, skip_special_tokens=True, temperature=0.8, top_k=40, top_p=0.9)[0]

-    # Convert text to speech and save as audio file
-    tts = gTTS(text=generated_caption, lang='en')
-    audio_path = "generated_audio.mp3"
-    tts.save(audio_path)
+    # Convert the generated caption to speech
+    speech = tts_synthesizer(generated_caption)
+    audio_path = "generated_audio.wav"
+    with open(audio_path, "wb") as f:
+        f.write(speech["audio"])

     return generated_caption, audio_path

@@ -33,4 +39,4 @@ demo = gr.Interface(
         gr.Button("Convert to Audio", play_audio),
     ]
 )
-demo.launch(share=True)
+demo.launch(share=True)
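
Note on the new save step: the transformers text-to-speech pipeline returns a dict holding a raw NumPy audio array plus a sampling rate, so f.write(speech["audio"]) writes headerless samples rather than a playable WAV. A minimal sketch of a safer save, following the approach shown in the suno/bark model card and assuming scipy is installed (the caption_to_audio helper name is illustrative, not from this commit):

import scipy.io.wavfile
from transformers import pipeline

tts_synthesizer = pipeline("text-to-speech", "suno/bark")

def caption_to_audio(caption, audio_path="generated_audio.wav"):
    # The pipeline returns {"audio": ndarray, "sampling_rate": int}
    speech = tts_synthesizer(caption)
    # Write a real WAV file (header + samples) instead of raw array bytes
    scipy.io.wavfile.write(
        audio_path,
        rate=speech["sampling_rate"],
        data=speech["audio"].squeeze(),  # assumes a (1, n) or (n,) float array
    )
    return audio_path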
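
The second hunk only shows a fragment of the gr.Interface, with a gr.Button bound to a play_audio callback. Since generate_caption now returns both a caption and an audio path, one option is to let Gradio render the audio itself; a sketch under that assumption (component labels and layout are illustrative, not the author's exact UI):

import gradio as gr

demo = gr.Interface(
    fn=generate_caption,                   # returns (caption, audio_path)
    inputs=gr.Image(type="pil"),           # BLIP's processor accepts PIL images
    outputs=[
        gr.Textbox(label="Generated caption"),
        gr.Audio(label="Spoken caption"),  # a returned file path is played directly
    ],
)

demo.launch(share=True)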