yasserrmd committed on
Commit
4a46ced
·
verified ·
1 Parent(s): 43660e3

Update generate_audio.py

Browse files
Files changed (1) hide show
  1. generate_audio.py +15 -9
generate_audio.py CHANGED
@@ -38,15 +38,15 @@ class TTSGenerator:
38
  self.speaker1_description = """
39
  Laura's voice is expressive and dramatic in delivery, speaking at a moderately fast pace with a very close recording that almost has no background noise.
40
  """
41
- self.speaker2_description = """
42
  Gary's voice is expressive and dramatic in delivery, speaking at a moderately fast pace with a very close recording that almost has no background noise.
43
  """
44
 
45
  # Load Bark model and processor for Speaker 2
46
- self.bark_processor = AutoProcessor.from_pretrained("suno/bark")
47
- self.bark_model = BarkModel.from_pretrained("suno/bark", torch_dtype=torch.float16).to(self.device)
48
- self.bark_sampling_rate = 24000
49
- self.voice_preset = "v2/en_speaker_6"
50
 
51
  @spaces.GPU
52
  def load_transcript(self):
@@ -89,10 +89,16 @@ class TTSGenerator:
89
  np.array: Audio array.
90
  int: Sampling rate.
91
  """
92
- inputs = self.bark_processor(text, voice_preset=self.voice_preset).to(self.device)
93
- speech_output = self.bark_model.generate(**inputs, temperature=0.9, semantic_temperature=0.8)
94
- audio_arr = speech_output[0].cpu().numpy()
95
- return audio_arr, self.bark_sampling_rate
 
 
 
 
 
 
96
 
97
  @staticmethod
98
  @spaces.GPU
 
38
  self.speaker1_description = """
39
  Laura's voice is expressive and dramatic in delivery, speaking at a moderately fast pace with a very close recording that almost has no background noise.
40
  """
41
+ self.speaker2_description = """
42
  Gary's voice is expressive and dramatic in delivery, speaking at a moderately fast pace with a very close recording that almost has no background noise.
43
  """
44
 
45
  # Load Bark model and processor for Speaker 2
46
+ # self.bark_processor = AutoProcessor.from_pretrained("suno/bark")
47
+ # self.bark_model = BarkModel.from_pretrained("suno/bark", torch_dtype=torch.float16).to(self.device)
48
+ # self.bark_sampling_rate = 24000
49
+ # self.voice_preset = "v2/en_speaker_6"
50
 
51
  @spaces.GPU
52
  def load_transcript(self):
 
89
  np.array: Audio array.
90
  int: Sampling rate.
91
  """
92
+
93
+ input_ids = self.parler_tokenizer(self.speaker2_description, return_tensors="pt").input_ids.to(self.device)
94
+ prompt_input_ids = self.parler_tokenizer(text, return_tensors="pt").input_ids.to(self.device)
95
+ generation = self.parler_model.generate(input_ids=input_ids, prompt_input_ids=prompt_input_ids)
96
+ audio_arr = generation.cpu().numpy().squeeze()
97
+ return audio_arr, self.parler_model.config.sampling_rate
98
+ # inputs = self.bark_processor(text, voice_preset=self.voice_preset).to(self.device)
99
+ # speech_output = self.bark_model.generate(**inputs, temperature=0.9, semantic_temperature=0.8)
100
+ # audio_arr = speech_output[0].cpu().numpy()
101
+ # return audio_arr, self.bark_sampling_rate
102
 
103
  @staticmethod
104
  @spaces.GPU