Update app.py
app.py
CHANGED
@@ -2,22 +2,28 @@ import gradio as gr
 from transformers import BlipProcessor, BlipForConditionalGeneration
 from gtts import gTTS
 from playsound import playsound
+from transformers import pipeline

+# Load the text-to-speech model
+tts_synthesizer = pipeline("text-to-speech", "suno/bark")
+
+# Load the image-to-text model
 model_id = "dblasko/blip-dalle3-img2prompt"
 model = BlipForConditionalGeneration.from_pretrained(model_id)
 processor = BlipProcessor.from_pretrained(model_id)

 def generate_caption(image):
+    # Generate caption from image
     inputs = processor(images=image, return_tensors="pt")
     pixel_values = inputs.pixel_values
-
     generated_ids = model.generate(pixel_values=pixel_values, max_length=50)
     generated_caption = processor.batch_decode(generated_ids, skip_special_tokens=True, temperature=0.8, top_k=40, top_p=0.9)[0]

-    # Convert
-
-    audio_path = "generated_audio.
-
+    # Convert the generated caption to speech
+    speech = tts_synthesizer(generated_caption)
+    audio_path = "generated_audio.wav"
+    with open(audio_path, "wb") as f:
+        f.write(speech["audio"])

     return generated_caption, audio_path

@@ -33,4 +39,4 @@ demo = gr.Interface(
     gr.Button("Convert to Audio", play_audio),
 ]
 )
-demo.launch(share=True)
+demo.launch(share=True)
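Note on the new speech step: the Transformers "text-to-speech" pipeline (including "suno/bark") returns a dict holding a NumPy waveform under "audio" and an integer "sampling_rate", so writing that array straight to a file yields data with no WAV header. A minimal sketch of how the saving step could look instead, assuming scipy is available (it is not imported anywhere in this diff):

import numpy as np
from scipy.io import wavfile

speech = tts_synthesizer(generated_caption)
# Bark returns a float waveform, possibly with a leading batch dimension;
# drop that dimension and let scipy write a proper 32-bit float WAV file.
audio = np.squeeze(speech["audio"]).astype(np.float32)
wavfile.write("generated_audio.wav", rate=speech["sampling_rate"], data=audio)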
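The gr.Interface definition itself falls outside these hunks, so only its closing lines are visible. As a rough sketch (not the committed code) of how the two values returned by generate_caption could be wired up, a Textbox and an Audio component cover both outputs; gr.Audio renders its own play control, which would make a separate "Convert to Audio" button and the play_audio handler unnecessary:

demo = gr.Interface(
    fn=generate_caption,
    inputs=gr.Image(type="pil", label="Input image"),
    outputs=[
        gr.Textbox(label="Generated caption"),
        gr.Audio(label="Spoken caption"),
    ],
)

demo.launch(share=True)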