Update app.py
Browse files
app.py
CHANGED
@@ -4,14 +4,18 @@ import numpy as np
|
|
4 |
import torch
|
5 |
import os
|
6 |
import torch
|
|
|
|
|
7 |
from scipy.io import wavfile
|
8 |
import scipy.signal as sps
|
9 |
-
|
10 |
-
|
11 |
|
12 |
-
|
13 |
-
|
|
|
14 |
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
|
|
|
15 |
|
16 |
|
17 |
spk_model_name = "speechbrain/spkrec-xvect-voxceleb"
|
@@ -31,9 +35,9 @@ def create_speaker_embedding(waveform):
|
|
31 |
|
32 |
def prepare_data(temp_text, temp_audio):
|
33 |
rate, audio_data = temp_audio
|
34 |
-
new_rate = 16000
|
35 |
-
number_of_samples = round(len(audio_data) * float(new_rate) / rate)
|
36 |
-
audio_data = sps.resample(audio_data, number_of_samples)
|
37 |
example = processor(
|
38 |
text=temp_text,
|
39 |
audio_target=audio_data,
|
@@ -43,30 +47,54 @@ def prepare_data(temp_text, temp_audio):
|
|
43 |
example_embeddings = torch.tensor(example["speaker_embeddings"]).unsqueeze(0)
|
44 |
return example_embeddings
|
45 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
46 |
|
47 |
-
|
48 |
-
|
|
|
|
|
49 |
embeddings=prepare_data(temp_text, temp_audio)
|
50 |
inputs = processor(text=text, return_tensors="pt")
|
51 |
spectrogram = model.generate_speech(inputs["input_ids"], embeddings)
|
52 |
|
53 |
with torch.no_grad():
|
54 |
speech = vocoder(spectrogram)
|
55 |
-
|
56 |
-
speech = (speech.numpy() * 32767).astype(np.int16)
|
57 |
-
return (16000, speech)
|
58 |
|
|
|
|
|
|
|
|
|
|
|
59 |
|
|
|
|
|
|
|
|
|
60 |
|
61 |
-
gr.
|
62 |
-
|
63 |
-
|
64 |
-
|
65 |
-
|
66 |
-
|
67 |
-
|
68 |
-
|
69 |
-
|
70 |
-
|
|
|
71 |
|
72 |
-
|
|
|
4 |
import torch
|
5 |
import os
|
6 |
import torch
|
7 |
+
from speechbrain.pretrained import EncoderClassifier
|
8 |
+
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
|
9 |
from scipy.io import wavfile
|
10 |
import scipy.signal as sps
|
11 |
+
import openai as ai
|
12 |
+
import gc
|
13 |
|
14 |
+
# SpeechT5 text-to-speech checkpoint shared by the processor and the model.
checkpoint = "microsoft/speecht5_tts"
processor = SpeechT5Processor.from_pretrained(checkpoint)
model = SpeechT5ForTextToSpeech.from_pretrained(checkpoint)
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

# SECURITY: the OpenAI key must never be committed to source control. It is
# read from the environment instead (e.g. a Space/CI secret named
# OPENAI_API_KEY). Any key that was previously hard-coded here has been
# published and should be revoked and rotated.
ai.api_key = os.environ.get("OPENAI_API_KEY")


# NOTE(review): presumably the speaker-embedding model identifier consumed by
# code outside this view -- confirm against its loader.
spk_model_name = "speechbrain/spkrec-xvect-voxceleb"
|
|
|
35 |
|
36 |
def prepare_data(temp_text, temp_audio):
|
37 |
rate, audio_data = temp_audio
|
38 |
+
# new_rate = 16000
|
39 |
+
# number_of_samples = round(len(audio_data) * float(new_rate) / rate)
|
40 |
+
# audio_data = sps.resample(audio_data, number_of_samples)
|
41 |
example = processor(
|
42 |
text=temp_text,
|
43 |
audio_target=audio_data,
|
|
|
47 |
example_embeddings = torch.tensor(example["speaker_embeddings"]).unsqueeze(0)
|
48 |
return example_embeddings
|
49 |
|
50 |
+
def generate_gpt4_response(user_text, print_output=False):
    """
    Query OpenAI GPT-4 with the user's text and return the reply.

    An instruction asking for an answer "in just 2 very small sentences" is
    appended to the prompt before it is sent as a single user message.

    :param user_text: str, the user's text to query for
    :param print_output: bool, whether to print the raw completion response
    :return: the message content of the first returned choice
    """
    # Separate the instruction from the user's text with a space; without it
    # the two run together (e.g. "What is AI?in just ...").
    prompt = user_text + ' in just 2 very small sentences'
    message = [{"role": "user", "content": prompt}]
    completions = ai.ChatCompletion.create(
        model="gpt-4",
        messages=message,
        max_tokens=250
    )

    if print_output:
        print(completions)

    # Return the first choice's text
    return completions['choices'][0]['message']['content']
|
65 |
|
66 |
+
|
67 |
+
def predict(temp_text, temp_audio, record_audio_prompt, prompt_text):
    """
    Answer ``prompt_text`` with GPT-4 and speak the answer in the voice of the
    supplied reference audio.

    :param temp_text: transcript passed to ``prepare_data`` with the reference audio
    :param temp_audio: uploaded reference audio, or None
    :param record_audio_prompt: microphone recording used when ``temp_audio`` is None
    :param prompt_text: text sent to ``generate_gpt4_response``
    :return: ``(text, (16000, samples))`` -- the GPT-4 reply and the generated
        speech as an int16 NumPy array tagged with a 16000 Hz sample rate
    :raises ValueError: if neither an uploaded nor a recorded audio is given
    """
    # Prefer the uploaded clip and fall back to the microphone recording.
    # (The previous code read ``audio_prompt`` before assigning it, which
    # raised UnboundLocalError on every call.)
    audio_prompt = temp_audio if temp_audio is not None else record_audio_prompt
    if audio_prompt is None:
        # Fail before spending an API call on a request that cannot finish.
        raise ValueError(
            "Provide a template speech clip or record one with the microphone."
        )

    text = generate_gpt4_response(prompt_text)
    embeddings = prepare_data(temp_text, audio_prompt)
    inputs = processor(text=text, return_tensors="pt")

    with torch.no_grad():
        spectrogram = model.generate_speech(inputs["input_ids"], embeddings)
        waveform = vocoder(spectrogram)

    # Clip before scaling so samples slightly outside [-1, 1] saturate instead
    # of wrapping around when cast to int16.
    samples = (np.clip(waveform.numpy(), -1.0, 1.0) * 32767).astype(np.int16)

    # Drop the large intermediates before collecting to keep peak memory low.
    del embeddings, inputs, spectrogram, waveform
    gc.collect()
    return text, (16000, samples)
|
82 |
|
83 |
+
# Gradio front end: inputs on the left, generated outputs on the right.
# NOTE(review): the nesting below is reconstructed from a flattened view of
# the file -- confirm the column/row layout against the original.
with gr.Blocks() as app:
    with gr.Row():
        # Input column: reference voice (text + audio) and the user's prompt.
        with gr.Column():
            temp_text = gr.Text(label="Template Text")
            temp_audio = gr.Audio(label="Template Speech", type="numpy")
            prompt_text = gr.Text(label="Input Text")
            record_audio_prompt = gr.Audio(
                label='recorded audio prompt',
                source='microphone',
                interactive=True,
            )
        # Output column: the text reply, its spoken version and the trigger.
        with gr.Column():
            text = gr.Textbox(label="Message")
            speech = gr.Audio(label="Generated Speech", type="numpy")
            btn = gr.Button("Generate!")

    # Input order must match predict's positional parameters.
    btn.click(
        predict,
        inputs=[temp_text, temp_audio, record_audio_prompt, prompt_text],
        outputs=[text, speech],
    )

app.launch()
|