Nepjune committed (verified)
Commit 6919033 · Parent(s): 059c7dc

Update app.py

Files changed (1): app.py (+8 -19)
app.py CHANGED
@@ -1,13 +1,12 @@
- from TTS.api import TTS
- from transformers import BlipProcessor, BlipForConditionalGeneration, AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
+ from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
  import torchaudio
  from torchaudio.transforms import Resample
  import torch
  import gradio as gr
 
- # Initialize TTS model from TTS library
- tts_model_path = "tts_models/multilingual/multi-dataset/xtts_v1"
- tts = TTS(tts_model_path, gpu=True)
+ # Initialize TTS model from Hugging Face
+ tts_model_name = "suno/bark"
+ tts = pipeline(task="text-to-speech", model=tts_model_name)
 
  # Initialize Blip model for image captioning
  model_id = "dblasko/blip-dalle3-img2prompt"
@@ -22,20 +21,11 @@ def generate_caption(image):
      generated_caption = blip_processor.batch_decode(generated_ids, skip_special_tokens=True, temperature=0.8, top_k=40, top_p=0.9)[0]
 
      # Use TTS model to convert generated caption to audio
-     tts.tts_to_file(text=generated_caption,
-                     file_path="generated_audio.wav",
-                     speaker_wav="/path/to/target/speaker.wav",
-                     language="en")
+     audio_output = tts(generated_caption)
+     audio_path = "generated_audio_resampled.wav"
+     torchaudio.save(audio_path, torch.tensor(audio_output[0]), audio_output["sampling_rate"])
 
-     # Resample the audio to match the expected sampling rate
-     waveform, sample_rate = torchaudio.load("generated_audio.wav")
-     resampler = Resample(orig_freq=sample_rate, new_freq=24_000)
-     waveform_resampled = resampler(waveform)
-
-     # Save the resampled audio
-     torchaudio.save("generated_audio_resampled.wav", waveform_resampled, 24_000)
-
-     return generated_caption, "generated_audio_resampled.wav"
+     return generated_caption, audio_path
 
  # Create a Gradio interface with an image input, a textbox output, a button, and an audio player
  demo = gr.Interface(
@@ -49,4 +39,3 @@ demo = gr.Interface(
      live=True
  )
  demo.launch(share=True)
-
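
Note: the added line `torchaudio.save(audio_path, torch.tensor(audio_output[0]), audio_output["sampling_rate"])` likely fails at runtime, because the transformers text-to-speech pipeline returns a dict of the form {"audio": ndarray, "sampling_rate": int} for a single string input, so `audio_output[0]` raises a KeyError. A minimal corrected sketch of just the TTS step, assuming that dict output shape and a mono waveform (the `caption_to_audio` helper name is hypothetical, not part of the commit):

# Sketch of a working replacement for the three "+" lines inside generate_caption.
# Assumes the text-to-speech pipeline returns {"audio": np.ndarray, "sampling_rate": int}.
import torch
import torchaudio
from transformers import pipeline

tts = pipeline(task="text-to-speech", model="suno/bark")

def caption_to_audio(caption, audio_path="generated_audio_resampled.wav"):
    output = tts(caption)
    # output["audio"] is a NumPy float array; torchaudio.save expects a
    # 2-D (channels, frames) tensor, so reshape the (assumed mono) waveform.
    waveform = torch.from_numpy(output["audio"]).reshape(1, -1)
    torchaudio.save(audio_path, waveform, output["sampling_rate"])
    return audio_path

Inside generate_caption, the function's return would then read `return generated_caption, caption_to_audio(generated_caption)`, keeping the Gradio textbox and audio outputs unchanged.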