yasserrmd committed
Commit 8eaa6b8 · verified · 1 parent: 2cc9d7b

Update generate_audio.py

Files changed (1)
  1. generate_audio.py +31 -12
generate_audio.py CHANGED
@@ -43,10 +43,10 @@ class TTSGenerator:
         """
 
         # Load Bark model and processor for Speaker 2
-        # self.bark_processor = AutoProcessor.from_pretrained("suno/bark")
-        # self.bark_model = BarkModel.from_pretrained("suno/bark", torch_dtype=torch.float16).to(self.device)
-        # self.bark_sampling_rate = 24000
-        # self.voice_preset = "v2/en_speaker_6"
+        self.bark_processor = AutoProcessor.from_pretrained("suno/bark")
+        self.bark_model = BarkModel.from_pretrained("suno/bark", torch_dtype=torch.float16).to(self.device)
+        self.bark_sampling_rate = 24000
+        self.voice_preset = "v2/en_speaker_6"
 
     @spaces.GPU
     def load_transcript(self):
@@ -82,12 +82,12 @@ class TTSGenerator:
         prompt_input_ids = self.parler_tokenizer(text, return_tensors="pt", padding=True).input_ids.to(self.device)
         attention_mask_prompt = self.parler_tokenizer(text, return_tensors="pt", padding=True).attention_mask.to(self.device)
 
-        # Generate audio with input IDs and attention masks
+        # Pass all required arguments to generate() for reliable behavior
         generation = self.parler_model.generate(
             input_ids=input_ids,
-            attention_mask=attention_mask_input,
+            attention_mask=attention_mask_input,  # Set attention mask for input IDs
             prompt_input_ids=prompt_input_ids,
-            prompt_attention_mask=attention_mask_prompt
+            prompt_attention_mask=attention_mask_prompt  # Set prompt attention mask
         )
         audio_arr = generation.cpu().numpy().squeeze()
         return audio_arr, self.parler_model.config.sampling_rate
@@ -105,15 +105,34 @@ class TTSGenerator:
             int: Sampling rate.
         """
 
-        input_ids = self.parler_tokenizer(self.speaker2_description, return_tensors="pt").input_ids.to(self.device)
-        prompt_input_ids = self.parler_tokenizer(text, return_tensors="pt").input_ids.to(self.device)
-        generation = self.parler_model.generate(input_ids=input_ids, prompt_input_ids=prompt_input_ids)
-        audio_arr = generation.cpu().numpy().squeeze()
-        return audio_arr, self.parler_model.config.sampling_rate
+        # input_ids = self.parler_tokenizer(self.speaker2_description, return_tensors="pt").input_ids.to(self.device)
+        # prompt_input_ids = self.parler_tokenizer(text, return_tensors="pt").input_ids.to(self.device)
+        # generation = self.parler_model.generate(input_ids=input_ids, prompt_input_ids=prompt_input_ids)
+        # audio_arr = generation.cpu().numpy().squeeze()
+        # return audio_arr, self.parler_model.config.sampling_rate
+
         # inputs = self.bark_processor(text, voice_preset=self.voice_preset).to(self.device)
         # speech_output = self.bark_model.generate(**inputs, temperature=0.9, semantic_temperature=0.8)
         # audio_arr = speech_output[0].cpu().numpy()
         # return audio_arr, self.bark_sampling_rate
+        # Tokenize input text and obtain input IDs and attention mask
+        inputs = self.bark_processor(text, voice_preset=self.voice_preset, return_tensors="pt", padding=True).to(self.device)
+        input_ids = inputs.input_ids.to(self.device)
+        attention_mask = inputs.attention_mask.to(self.device)
+
+        # Generate speech output with both input IDs and attention mask
+        speech_output = self.bark_model.generate(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            temperature=0.9,
+            semantic_temperature=0.8
+        )
+        # Convert the generated audio to numpy array
+        audio_arr = speech_output[0].cpu().numpy()
+        return audio_arr, self.bark_sampling_rate
+
+        # Convert the generated audio to numpy array
+        audio_arr = speech_output[0].cpu().numpy()
 
     @staticmethod
     @spaces.GPU
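
For context, the second hunk's pattern of passing both attention masks to Parler-TTS generation can be reproduced outside the class. The sketch below is a minimal, self-contained version of that call; the parler-tts/parler-tts-mini-v1 checkpoint, the voice description, and the prompt text are assumptions for illustration (the repository's actual checkpoint and speaker descriptions are not shown in this diff).

# Hedged sketch of the attention-mask pattern from the second hunk.
# Checkpoint name, description, and prompt text below are assumptions.
import torch
from parler_tts import ParlerTTSForConditionalGeneration
from transformers import AutoTokenizer

device = "cuda" if torch.cuda.is_available() else "cpu"
model = ParlerTTSForConditionalGeneration.from_pretrained("parler-tts/parler-tts-mini-v1").to(device)
tokenizer = AutoTokenizer.from_pretrained("parler-tts/parler-tts-mini-v1")

description = "A calm female narrator with a clear, neutral studio recording."  # assumed speaker description
text = "Welcome to the show."  # assumed prompt

# Tokenize the voice description (input_ids) and the prompt (prompt_input_ids),
# keeping the attention masks returned by the tokenizer.
desc = tokenizer(description, return_tensors="pt", padding=True).to(device)
prompt = tokenizer(text, return_tensors="pt", padding=True).to(device)

# Pass both attention masks to generate(), as the commit now does.
generation = model.generate(
    input_ids=desc.input_ids,
    attention_mask=desc.attention_mask,
    prompt_input_ids=prompt.input_ids,
    prompt_attention_mask=prompt.attention_mask,
)
audio_arr = generation.cpu().numpy().squeeze()
sampling_rate = model.config.sampling_rate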
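
Similarly, the third hunk switches Speaker 2 from the commented-out Parler path back to Bark. The sketch below reproduces that path standalone under the same assumptions as the diff (the suno/bark checkpoint and the v2/en_speaker_6 preset); the example text and output filename are illustrative, and scipy is used only to write the result to disk.

# Hedged sketch of the Bark path enabled for Speaker 2 in the third hunk.
# Model id and voice preset come from the diff; the text and WAV filename are made up.
import torch
from transformers import AutoProcessor, BarkModel
from scipy.io import wavfile

device = "cuda" if torch.cuda.is_available() else "cpu"
dtype = torch.float16 if device == "cuda" else torch.float32  # the commit loads Bark in fp16 on GPU

processor = AutoProcessor.from_pretrained("suno/bark")
model = BarkModel.from_pretrained("suno/bark", torch_dtype=dtype).to(device)
sampling_rate = 24000  # Bark's output sample rate, hard-coded the same way in the commit

text = "Thanks for having me, it's great to be here!"  # assumed Speaker 2 line
inputs = processor(text, voice_preset="v2/en_speaker_6", return_tensors="pt").to(device)

# generate() consumes the processor outputs (input IDs, attention mask, voice-preset history);
# the temperatures mirror the values used in the commit.
speech_output = model.generate(**inputs, temperature=0.9, semantic_temperature=0.8)

# Convert to a float32 numpy waveform and save it for listening.
audio_arr = speech_output[0].cpu().float().numpy()
wavfile.write("speaker2_sample.wav", sampling_rate, audio_arr)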