Update app.py
Browse files
app.py
CHANGED
@@ -4,6 +4,8 @@ import numpy as np
|
|
4 |
import torch
|
5 |
import os
|
6 |
import torch
|
|
|
|
|
7 |
from speechbrain.pretrained import EncoderClassifier
|
8 |
from transformers import AutoProcessor, AutoModelForTextToSpectrogram, SpeechT5HifiGan
|
9 |
|
@@ -29,10 +31,13 @@ def create_speaker_embedding(waveform):
|
|
29 |
|
30 |
def prepare_data(temp_text, temp_audio):
|
31 |
rate, audio_data = temp_audio
|
|
|
|
|
|
|
32 |
example = processor(
|
33 |
text=temp_text,
|
34 |
audio_target=audio_data,
|
35 |
-
sampling_rate=
|
36 |
return_attention_mask=False,)
|
37 |
example["speaker_embeddings"] = create_speaker_embedding(audio_data)
|
38 |
example_embeddings = torch.tensor(example["speaker_embeddings"]).unsqueeze(0)
|
|
|
4 |
import torch
|
5 |
import os
|
6 |
import torch
|
7 |
+
from scipy.io import wavfile
|
8 |
+
import scipy.signal as sps
|
9 |
from speechbrain.pretrained import EncoderClassifier
|
10 |
from transformers import AutoProcessor, AutoModelForTextToSpectrogram, SpeechT5HifiGan
|
11 |
|
|
|
31 |
|
32 |
def prepare_data(temp_text, temp_audio):
|
33 |
rate, audio_data = temp_audio
|
34 |
+
new_rate = 16000
|
35 |
+
number_of_samples = round(len(audio_data) * float(new_rate) / rate)
|
36 |
+
audio_data = sps.resample(audio_data, number_of_samples)
|
37 |
example = processor(
|
38 |
text=temp_text,
|
39 |
audio_target=audio_data,
|
40 |
+
sampling_rate=16000,
|
41 |
return_attention_mask=False,)
|
42 |
example["speaker_embeddings"] = create_speaker_embedding(audio_data)
|
43 |
example_embeddings = torch.tensor(example["speaker_embeddings"]).unsqueeze(0)
|