Nepjune committed (verified)
Commit 6919033 · Parent(s): 059c7dc

Update app.py

Files changed (1): app.py (+8 -19)
app.py CHANGED
@@ -1,13 +1,12 @@
- from TTS.api import TTS
- from transformers import BlipProcessor, BlipForConditionalGeneration, AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
+ from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
  import torchaudio
  from torchaudio.transforms import Resample
  import torch
  import gradio as gr
 
- # Initialize TTS model from TTS library
- tts_model_path = "tts_models/multilingual/multi-dataset/xtts_v1"
- tts = TTS(tts_model_path, gpu=True)
+ # Initialize TTS model from Hugging Face
+ tts_model_name = "suno/bark"
+ tts = pipeline(task="text-to-speech", model=tts_model_name)
 
  # Initialize Blip model for image captioning
  model_id = "dblasko/blip-dalle3-img2prompt"
@@ -22,20 +21,11 @@ def generate_caption(image):
      generated_caption = blip_processor.batch_decode(generated_ids, skip_special_tokens=True, temperature=0.8, top_k=40, top_p=0.9)[0]
 
      # Use TTS model to convert generated caption to audio
-     tts.tts_to_file(text=generated_caption,
-                     file_path="generated_audio.wav",
-                     speaker_wav="/path/to/target/speaker.wav",
-                     language="en")
+     audio_output = tts(generated_caption)
+     audio_path = "generated_audio_resampled.wav"
+     torchaudio.save(audio_path, torch.tensor(audio_output[0]), audio_output["sampling_rate"])
 
-     # Resample the audio to match the expected sampling rate
-     waveform, sample_rate = torchaudio.load("generated_audio.wav")
-     resampler = Resample(orig_freq=sample_rate, new_freq=24_000)
-     waveform_resampled = resampler(waveform)
-
-     # Save the resampled audio
-     torchaudio.save("generated_audio_resampled.wav", waveform_resampled, 24_000)
-
-     return generated_caption, "generated_audio_resampled.wav"
+     return generated_caption, audio_path
 
  # Create a Gradio interface with an image input, a textbox output, a button, and an audio player
  demo = gr.Interface(
@@ -49,4 +39,3 @@ demo = gr.Interface(
      live=True
  )
  demo.launch(share=True)
-
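
Note: the added line `torchaudio.save(audio_path, torch.tensor(audio_output[0]), audio_output["sampling_rate"])` likely fails at runtime, because the transformers text-to-speech pipeline returns a dict of the form {"audio": ndarray, "sampling_rate": int} for a single string input, so `audio_output[0]` raises a KeyError. A minimal corrected sketch of just the TTS step, assuming that dict output shape and a mono waveform (the `caption_to_audio` helper name is hypothetical, not part of the commit):

# Sketch of a working replacement for the three "+" lines inside generate_caption.
# Assumes the text-to-speech pipeline returns {"audio": np.ndarray, "sampling_rate": int}.
import torch
import torchaudio
from transformers import pipeline

tts = pipeline(task="text-to-speech", model="suno/bark")

def caption_to_audio(caption, audio_path="generated_audio_resampled.wav"):
    output = tts(caption)
    # output["audio"] is a NumPy float array; torchaudio.save expects a
    # 2-D (channels, frames) tensor, so reshape the (assumed mono) waveform.
    waveform = torch.from_numpy(output["audio"]).reshape(1, -1)
    torchaudio.save(audio_path, waveform, output["sampling_rate"])
    return audio_path

Inside generate_caption, the function's return would then read `return generated_caption, caption_to_audio(generated_caption)`, keeping the Gradio textbox and audio outputs unchanged.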