Update app.py
app.py CHANGED
@@ -4,6 +4,8 @@ import numpy as np
 import torch
 import os
 import torch
+from scipy.io import wavfile
+import scipy.signal as sps
 from speechbrain.pretrained import EncoderClassifier
 from transformers import AutoProcessor, AutoModelForTextToSpectrogram, SpeechT5HifiGan
 
@@ -29,10 +31,13 @@ def create_speaker_embedding(waveform):
 
 def prepare_data(temp_text, temp_audio):
     rate, audio_data = temp_audio
+    new_rate = 16000
+    number_of_samples = round(len(audio_data) * float(new_rate) / rate)
+    audio_data = sps.resample(audio_data, number_of_samples)
     example = processor(
         text=temp_text,
         audio_target=audio_data,
-        sampling_rate=
+        sampling_rate=16000,
         return_attention_mask=False,)
     example["speaker_embeddings"] = create_speaker_embedding(audio_data)
     example_embeddings = torch.tensor(example["speaker_embeddings"]).unsqueeze(0)