Spaces:
Running
Running
Update generate_audio.py
Browse files
- generate_audio.py +18 -9
generate_audio.py
CHANGED
@@ -105,21 +105,30 @@ class TTSGenerator:
|
|
105 |
int: Sampling rate.
|
106 |
"""
|
107 |
|
108 |
-
|
109 |
-
|
110 |
-
|
111 |
-
|
112 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
113 |
|
114 |
# inputs = self.bark_processor(text, voice_preset=self.voice_preset).to(self.device)
|
115 |
# speech_output = self.bark_model.generate(**inputs, temperature=0.9, semantic_temperature=0.8)
|
116 |
# audio_arr = speech_output[0].cpu().numpy()
|
117 |
# return audio_arr, self.bark_sampling_rate
|
118 |
# Tokenize input text and obtain input IDs and attention mask
|
119 |
-
inputs = self.bark_processor(text, voice_preset=self.voice_preset).to(self.device)
|
120 |
-
speech_output = self.bark_model.generate(**inputs, temperature=0.9, semantic_temperature=0.8)
|
121 |
-
audio_arr = speech_output[0].cpu().numpy()
|
122 |
-
return audio_arr, self.bark_sampling_rate
|
123 |
|
124 |
@staticmethod
|
125 |
@spaces.GPU
|
|
|
105 |
int: Sampling rate.
|
106 |
"""
|
107 |
|
108 |
+
input_ids = self.parler_tokenizer(self.speaker2_description, return_tensors="pt", padding=True).input_ids.to(self.device)
|
109 |
+
attention_mask_input = self.parler_tokenizer(self.speaker1_description, return_tensors="pt", padding=True).attention_mask.to(self.device)
|
110 |
+
|
111 |
+
prompt_input_ids = self.parler_tokenizer(text, return_tensors="pt", padding=True).input_ids.to(self.device)
|
112 |
+
attention_mask_prompt = self.parler_tokenizer(text, return_tensors="pt", padding=True).attention_mask.to(self.device)
|
113 |
+
|
114 |
+
# Pass all required arguments to generate() for reliable behavior
|
115 |
+
generation = self.parler_model.generate(
|
116 |
+
input_ids=input_ids,
|
117 |
+
attention_mask=attention_mask_input, # Set attention mask for input IDs
|
118 |
+
prompt_input_ids=prompt_input_ids,
|
119 |
+
prompt_attention_mask=attention_mask_prompt # Set prompt attention mask
|
120 |
+
)
|
121 |
+
audio_arr = generation.cpu().numpy().squeeze()
|
122 |
|
123 |
# inputs = self.bark_processor(text, voice_preset=self.voice_preset).to(self.device)
|
124 |
# speech_output = self.bark_model.generate(**inputs, temperature=0.9, semantic_temperature=0.8)
|
125 |
# audio_arr = speech_output[0].cpu().numpy()
|
126 |
# return audio_arr, self.bark_sampling_rate
|
127 |
# Tokenize input text and obtain input IDs and attention mask
|
128 |
+
# inputs = self.bark_processor(text, voice_preset=self.voice_preset).to(self.device)
|
129 |
+
# speech_output = self.bark_model.generate(**inputs, temperature=0.9, semantic_temperature=0.8)
|
130 |
+
# audio_arr = speech_output[0].cpu().numpy()
|
131 |
+
return audio_arr, self.parler_model.config.sampling_rate
|
132 |
|
133 |
@staticmethod
|
134 |
@spaces.GPU
|