yasserrmd committed on
Commit
f8a93c9
·
verified ·
1 Parent(s): a6019e2

Update generate_audio.py

Browse files
Files changed (1) hide show
  1. generate_audio.py +18 -9
generate_audio.py CHANGED
@@ -105,21 +105,30 @@ class TTSGenerator:
105
  int: Sampling rate.
106
  """
107
 
108
- # input_ids = self.parler_tokenizer(self.speaker2_description, return_tensors="pt").input_ids.to(self.device)
109
- # prompt_input_ids = self.parler_tokenizer(text, return_tensors="pt").input_ids.to(self.device)
110
- # generation = self.parler_model.generate(input_ids=input_ids, prompt_input_ids=prompt_input_ids)
111
- # audio_arr = generation.cpu().numpy().squeeze()
112
- # return audio_arr, self.parler_model.config.sampling_rate
 
 
 
 
 
 
 
 
 
113
 
114
  # inputs = self.bark_processor(text, voice_preset=self.voice_preset).to(self.device)
115
  # speech_output = self.bark_model.generate(**inputs, temperature=0.9, semantic_temperature=0.8)
116
  # audio_arr = speech_output[0].cpu().numpy()
117
  # return audio_arr, self.bark_sampling_rate
118
  # Tokenize input text and obtain input IDs and attention mask
119
- inputs = self.bark_processor(text, voice_preset=self.voice_preset).to(self.device)
120
- speech_output = self.bark_model.generate(**inputs, temperature=0.9, semantic_temperature=0.8)
121
- audio_arr = speech_output[0].cpu().numpy()
122
- return audio_arr, self.bark_sampling_rate
123
 
124
  @staticmethod
125
  @spaces.GPU
 
105
  int: Sampling rate.
106
  """
107
 
108
+ input_ids = self.parler_tokenizer(self.speaker2_description, return_tensors="pt", padding=True).input_ids.to(self.device)
109
+ attention_mask_input = self.parler_tokenizer(self.speaker1_description, return_tensors="pt", padding=True).attention_mask.to(self.device)
110
+
111
+ prompt_input_ids = self.parler_tokenizer(text, return_tensors="pt", padding=True).input_ids.to(self.device)
112
+ attention_mask_prompt = self.parler_tokenizer(text, return_tensors="pt", padding=True).attention_mask.to(self.device)
113
+
114
+ # Pass all required arguments to generate() for reliable behavior
115
+ generation = self.parler_model.generate(
116
+ input_ids=input_ids,
117
+ attention_mask=attention_mask_input, # Set attention mask for input IDs
118
+ prompt_input_ids=prompt_input_ids,
119
+ prompt_attention_mask=attention_mask_prompt # Set prompt attention mask
120
+ )
121
+ audio_arr = generation.cpu().numpy().squeeze()
122
 
123
  # inputs = self.bark_processor(text, voice_preset=self.voice_preset).to(self.device)
124
  # speech_output = self.bark_model.generate(**inputs, temperature=0.9, semantic_temperature=0.8)
125
  # audio_arr = speech_output[0].cpu().numpy()
126
  # return audio_arr, self.bark_sampling_rate
127
  # Tokenize input text and obtain input IDs and attention mask
128
+ # inputs = self.bark_processor(text, voice_preset=self.voice_preset).to(self.device)
129
+ # speech_output = self.bark_model.generate(**inputs, temperature=0.9, semantic_temperature=0.8)
130
+ # audio_arr = speech_output[0].cpu().numpy()
131
+ return audio_arr, self.parler_model.config.sampling_rate
132
 
133
  @staticmethod
134
  @spaces.GPU