Spaces:
Running
Running
Update generate_audio.py
Browse files - generate_audio.py +31 -12
generate_audio.py
CHANGED
@@ -43,10 +43,10 @@ class TTSGenerator:
|
|
43 |
"""
|
44 |
|
45 |
# Load Bark model and processor for Speaker 2
|
46 |
-
|
47 |
-
|
48 |
-
|
49 |
-
|
50 |
|
51 |
@spaces.GPU
|
52 |
def load_transcript(self):
|
@@ -82,12 +82,12 @@ class TTSGenerator:
|
|
82 |
prompt_input_ids = self.parler_tokenizer(text, return_tensors="pt", padding=True).input_ids.to(self.device)
|
83 |
attention_mask_prompt = self.parler_tokenizer(text, return_tensors="pt", padding=True).attention_mask.to(self.device)
|
84 |
|
85 |
-
#
|
86 |
generation = self.parler_model.generate(
|
87 |
input_ids=input_ids,
|
88 |
-
attention_mask=attention_mask_input,
|
89 |
prompt_input_ids=prompt_input_ids,
|
90 |
-
prompt_attention_mask=attention_mask_prompt
|
91 |
)
|
92 |
audio_arr = generation.cpu().numpy().squeeze()
|
93 |
return audio_arr, self.parler_model.config.sampling_rate
|
@@ -105,15 +105,34 @@ class TTSGenerator:
|
|
105 |
int: Sampling rate.
|
106 |
"""
|
107 |
|
108 |
-
input_ids = self.parler_tokenizer(self.speaker2_description, return_tensors="pt").input_ids.to(self.device)
|
109 |
-
prompt_input_ids = self.parler_tokenizer(text, return_tensors="pt").input_ids.to(self.device)
|
110 |
-
generation = self.parler_model.generate(input_ids=input_ids, prompt_input_ids=prompt_input_ids)
|
111 |
-
audio_arr = generation.cpu().numpy().squeeze()
|
112 |
-
return audio_arr, self.parler_model.config.sampling_rate
|
|
|
113 |
# inputs = self.bark_processor(text, voice_preset=self.voice_preset).to(self.device)
|
114 |
# speech_output = self.bark_model.generate(**inputs, temperature=0.9, semantic_temperature=0.8)
|
115 |
# audio_arr = speech_output[0].cpu().numpy()
|
116 |
# return audio_arr, self.bark_sampling_rate
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
117 |
|
118 |
@staticmethod
|
119 |
@spaces.GPU
|
|
|
43 |
"""
|
44 |
|
45 |
# Load Bark model and processor for Speaker 2
|
46 |
+
self.bark_processor = AutoProcessor.from_pretrained("suno/bark")
|
47 |
+
self.bark_model = BarkModel.from_pretrained("suno/bark", torch_dtype=torch.float16).to(self.device)
|
48 |
+
self.bark_sampling_rate = 24000
|
49 |
+
self.voice_preset = "v2/en_speaker_6"
|
50 |
|
51 |
@spaces.GPU
|
52 |
def load_transcript(self):
|
|
|
82 |
prompt_input_ids = self.parler_tokenizer(text, return_tensors="pt", padding=True).input_ids.to(self.device)
|
83 |
attention_mask_prompt = self.parler_tokenizer(text, return_tensors="pt", padding=True).attention_mask.to(self.device)
|
84 |
|
85 |
+
# Pass all required arguments to generate() for reliable behavior
|
86 |
generation = self.parler_model.generate(
|
87 |
input_ids=input_ids,
|
88 |
+
attention_mask=attention_mask_input, # Set attention mask for input IDs
|
89 |
prompt_input_ids=prompt_input_ids,
|
90 |
+
prompt_attention_mask=attention_mask_prompt # Set prompt attention mask
|
91 |
)
|
92 |
audio_arr = generation.cpu().numpy().squeeze()
|
93 |
return audio_arr, self.parler_model.config.sampling_rate
|
|
|
105 |
int: Sampling rate.
|
106 |
"""
|
107 |
|
108 |
+
# input_ids = self.parler_tokenizer(self.speaker2_description, return_tensors="pt").input_ids.to(self.device)
|
109 |
+
# prompt_input_ids = self.parler_tokenizer(text, return_tensors="pt").input_ids.to(self.device)
|
110 |
+
# generation = self.parler_model.generate(input_ids=input_ids, prompt_input_ids=prompt_input_ids)
|
111 |
+
# audio_arr = generation.cpu().numpy().squeeze()
|
112 |
+
# return audio_arr, self.parler_model.config.sampling_rate
|
113 |
+
|
114 |
# inputs = self.bark_processor(text, voice_preset=self.voice_preset).to(self.device)
|
115 |
# speech_output = self.bark_model.generate(**inputs, temperature=0.9, semantic_temperature=0.8)
|
116 |
# audio_arr = speech_output[0].cpu().numpy()
|
117 |
# return audio_arr, self.bark_sampling_rate
|
118 |
+
# Tokenize input text and obtain input IDs and attention mask
|
119 |
+
inputs = self.bark_processor(text, voice_preset=self.voice_preset, return_tensors="pt", padding=True).to(self.device)
|
120 |
+
input_ids = inputs.input_ids.to(self.device)
|
121 |
+
attention_mask = inputs.attention_mask.to(self.device)
|
122 |
+
|
123 |
+
# Generate speech output with both input IDs and attention mask
|
124 |
+
speech_output = self.bark_model.generate(
|
125 |
+
input_ids=input_ids,
|
126 |
+
attention_mask=attention_mask,
|
127 |
+
temperature=0.9,
|
128 |
+
semantic_temperature=0.8
|
129 |
+
)
|
130 |
+
# Convert the generated audio to numpy array
|
131 |
+
audio_arr = speech_output[0].cpu().numpy()
|
132 |
+
return audio_arr, self.bark_sampling_rate
|
133 |
+
|
134 |
+
# Convert the generated audio to numpy array
|
135 |
+
audio_arr = speech_output[0].cpu().numpy()
|
136 |
|
137 |
@staticmethod
|
138 |
@spaces.GPU
|