Spaces:
Runtime error
Runtime error
Update app.py
Browse files
app.py
CHANGED
@@ -1,13 +1,12 @@
|
|
1 |
-
from
|
2 |
-
from transformers import BlipProcessor, BlipForConditionalGeneration, AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
|
3 |
import torchaudio
|
4 |
from torchaudio.transforms import Resample
|
5 |
import torch
|
6 |
import gradio as gr
|
7 |
|
8 |
-
# Initialize TTS model from
|
9 |
-
|
10 |
-
tts =
|
11 |
|
12 |
# Initialize Blip model for image captioning
|
13 |
model_id = "dblasko/blip-dalle3-img2prompt"
|
@@ -22,20 +21,11 @@ def generate_caption(image):
|
|
22 |
generated_caption = blip_processor.batch_decode(generated_ids, skip_special_tokens=True, temperature=0.8, top_k=40, top_p=0.9)[0]
|
23 |
|
24 |
# Use TTS model to convert generated caption to audio
|
25 |
-
tts
|
26 |
-
|
27 |
-
|
28 |
-
language="en")
|
29 |
|
30 |
-
|
31 |
-
waveform, sample_rate = torchaudio.load("generated_audio.wav")
|
32 |
-
resampler = Resample(orig_freq=sample_rate, new_freq=24_000)
|
33 |
-
waveform_resampled = resampler(waveform)
|
34 |
-
|
35 |
-
# Save the resampled audio
|
36 |
-
torchaudio.save("generated_audio_resampled.wav", waveform_resampled, 24_000)
|
37 |
-
|
38 |
-
return generated_caption, "generated_audio_resampled.wav"
|
39 |
|
40 |
# Create a Gradio interface with an image input, a textbox output, a button, and an audio player
|
41 |
demo = gr.Interface(
|
@@ -49,4 +39,3 @@ demo = gr.Interface(
|
|
49 |
live=True
|
50 |
)
|
51 |
demo.launch(share=True)
|
52 |
-
|
|
|
1 |
+
from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
|
|
|
2 |
import torchaudio
|
3 |
from torchaudio.transforms import Resample
|
4 |
import torch
|
5 |
import gradio as gr
|
6 |
|
7 |
+
# Initialize TTS model from Hugging Face
|
8 |
+
tts_model_name = "suno/bark"
|
9 |
+
tts = pipeline(task="text-to-speech", model=tts_model_name)
|
10 |
|
11 |
# Initialize Blip model for image captioning
|
12 |
model_id = "dblasko/blip-dalle3-img2prompt"
|
|
|
21 |
generated_caption = blip_processor.batch_decode(generated_ids, skip_special_tokens=True, temperature=0.8, top_k=40, top_p=0.9)[0]
|
22 |
|
23 |
# Use TTS model to convert generated caption to audio
|
24 |
+
audio_output = tts(generated_caption)
|
25 |
+
audio_path = "generated_audio_resampled.wav"
|
26 |
+
# The text-to-speech pipeline returns a dict ({"audio": ..., "sampling_rate": ...}), so indexing it with [0] raises KeyError. Read the waveform from the "audio" key and make sure it is 2-D (channels, samples), as torchaudio.save expects.
torchaudio.save(audio_path, torch.atleast_2d(torch.as_tensor(audio_output["audio"])), audio_output["sampling_rate"])
|
|
|
27 |
|
28 |
+
return generated_caption, audio_path
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
29 |
|
30 |
# Create a Gradio interface with an image input, a textbox output, a button, and an audio player
|
31 |
demo = gr.Interface(
|
|
|
39 |
live=True
|
40 |
)
|
41 |
demo.launch(share=True)
|
|