Spaces:
Running
Running
Update generate_audio.py
Browse files- generate_audio.py +15 -9
generate_audio.py
CHANGED
|
@@ -38,15 +38,15 @@ class TTSGenerator:
|
|
| 38 |
self.speaker1_description = """
|
| 39 |
Laura's voice is expressive and dramatic in delivery, speaking at a moderately fast pace with a very close recording that almost has no background noise.
|
| 40 |
"""
|
| 41 |
-
|
| 42 |
Gary's voice is expressive and dramatic in delivery, speaking at a moderately fast pace with a very close recording that almost has no background noise.
|
| 43 |
"""
|
| 44 |
|
| 45 |
# Load Bark model and processor for Speaker 2
|
| 46 |
-
self.bark_processor = AutoProcessor.from_pretrained("suno/bark")
|
| 47 |
-
self.bark_model = BarkModel.from_pretrained("suno/bark", torch_dtype=torch.float16).to(self.device)
|
| 48 |
-
self.bark_sampling_rate = 24000
|
| 49 |
-
self.voice_preset = "v2/en_speaker_6"
|
| 50 |
|
| 51 |
@spaces.GPU
|
| 52 |
def load_transcript(self):
|
|
@@ -89,10 +89,16 @@ class TTSGenerator:
|
|
| 89 |
np.array: Audio array.
|
| 90 |
int: Sampling rate.
|
| 91 |
"""
|
| 92 |
-
|
| 93 |
-
|
| 94 |
-
|
| 95 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 96 |
|
| 97 |
@staticmethod
|
| 98 |
@spaces.GPU
|
|
|
|
| 38 |
self.speaker1_description = """
|
| 39 |
Laura's voice is expressive and dramatic in delivery, speaking at a moderately fast pace with a very close recording that almost has no background noise.
|
| 40 |
"""
|
| 41 |
+
self.speaker2_description = """
|
| 42 |
Gary's voice is expressive and dramatic in delivery, speaking at a moderately fast pace with a very close recording that almost has no background noise.
|
| 43 |
"""
|
| 44 |
|
| 45 |
# Bark model and processor for Speaker 2 (disabled below; Speaker 2 audio is now generated by the Parler model using speaker2_description)
|
| 46 |
+
# self.bark_processor = AutoProcessor.from_pretrained("suno/bark")
|
| 47 |
+
# self.bark_model = BarkModel.from_pretrained("suno/bark", torch_dtype=torch.float16).to(self.device)
|
| 48 |
+
# self.bark_sampling_rate = 24000
|
| 49 |
+
# self.voice_preset = "v2/en_speaker_6"
|
| 50 |
|
| 51 |
@spaces.GPU
|
| 52 |
def load_transcript(self):
|
|
|
|
| 89 |
np.array: Audio array.
|
| 90 |
int: Sampling rate.
|
| 91 |
"""
|
| 92 |
+
|
| 93 |
+
input_ids = self.parler_tokenizer(self.speaker2_description, return_tensors="pt").input_ids.to(self.device)
|
| 94 |
+
prompt_input_ids = self.parler_tokenizer(text, return_tensors="pt").input_ids.to(self.device)
|
| 95 |
+
generation = self.parler_model.generate(input_ids=input_ids, prompt_input_ids=prompt_input_ids)
|
| 96 |
+
audio_arr = generation.cpu().numpy().squeeze()
|
| 97 |
+
return audio_arr, self.parler_model.config.sampling_rate
|
| 98 |
+
# inputs = self.bark_processor(text, voice_preset=self.voice_preset).to(self.device)
|
| 99 |
+
# speech_output = self.bark_model.generate(**inputs, temperature=0.9, semantic_temperature=0.8)
|
| 100 |
+
# audio_arr = speech_output[0].cpu().numpy()
|
| 101 |
+
# return audio_arr, self.bark_sampling_rate
|
| 102 |
|
| 103 |
@staticmethod
|
| 104 |
@spaces.GPU
|