Nepjune committed
Commit 97599f2 · verified · 1 Parent(s): c8345b4

Update app.py

Files changed (1)
  1. app.py +12 -6
app.py CHANGED
@@ -2,22 +2,28 @@ import gradio as gr
 from transformers import BlipProcessor, BlipForConditionalGeneration
 from gtts import gTTS
 from playsound import playsound
+from transformers import pipeline

+# Load the text-to-speech model
+tts_synthesizer = pipeline("text-to-speech", "suno/bark")
+
+# Load the image-to-text model
 model_id = "dblasko/blip-dalle3-img2prompt"
 model = BlipForConditionalGeneration.from_pretrained(model_id)
 processor = BlipProcessor.from_pretrained(model_id)

 def generate_caption(image):
+    # Generate caption from image
     inputs = processor(images=image, return_tensors="pt")
     pixel_values = inputs.pixel_values
-
     generated_ids = model.generate(pixel_values=pixel_values, max_length=50)
     generated_caption = processor.batch_decode(generated_ids, skip_special_tokens=True, temperature=0.8, top_k=40, top_p=0.9)[0]

-    # Convert text to speech and save as audio file
-    tts = gTTS(text=generated_caption, lang='en')
-    audio_path = "generated_audio.mp3"
-    tts.save(audio_path)
+    # Convert the generated caption to speech
+    speech = tts_synthesizer(generated_caption)
+    audio_path = "generated_audio.wav"
+    with open(audio_path, "wb") as f:
+        f.write(speech["audio"])

     return generated_caption, audio_path

@@ -33,4 +39,4 @@ demo = gr.Interface(
         gr.Button("Convert to Audio", play_audio),
     ]
 )
-demo.launch(share=True)
+demo.launch(share=True)
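
Note on the new save step: the transformers text-to-speech pipeline returns a dict holding a raw NumPy audio array plus a sampling rate, so f.write(speech["audio"]) writes headerless samples rather than a playable WAV. A minimal sketch of a safer save, following the approach shown in the suno/bark model card and assuming scipy is installed (the caption_to_audio helper name is illustrative, not from this commit):

import scipy.io.wavfile
from transformers import pipeline

tts_synthesizer = pipeline("text-to-speech", "suno/bark")

def caption_to_audio(caption, audio_path="generated_audio.wav"):
    # The pipeline returns {"audio": ndarray, "sampling_rate": int}
    speech = tts_synthesizer(caption)
    # Write a real WAV file (header + samples) instead of raw array bytes
    scipy.io.wavfile.write(
        audio_path,
        rate=speech["sampling_rate"],
        data=speech["audio"].squeeze(),  # assumes a (1, n) or (n,) float array
    )
    return audio_path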
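
The second hunk only shows a fragment of the gr.Interface, with a gr.Button bound to a play_audio callback. Since generate_caption now returns both a caption and an audio path, one option is to let Gradio render the audio itself; a sketch under that assumption (component labels and layout are illustrative, not the author's exact UI):

import gradio as gr

demo = gr.Interface(
    fn=generate_caption,                   # returns (caption, audio_path)
    inputs=gr.Image(type="pil"),           # BLIP's processor accepts PIL images
    outputs=[
        gr.Textbox(label="Generated caption"),
        gr.Audio(label="Spoken caption"),  # a returned file path is played directly
    ],
)

demo.launch(share=True)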