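"""Voice-cloning text-to-speech demo.

Given a short template recording and a text prompt, the app asks GPT-4 for a
brief reply and synthesizes that reply in the template speaker's voice using
SpeechT5, a HiFi-GAN vocoder, and SpeechBrain x-vector speaker embeddings,
served through a Gradio interface.
"""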
import gc
import os

import gradio as gr
import numpy as np
import openai as ai
import scipy.signal as sps
import torch
from speechbrain.pretrained import EncoderClassifier
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
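# SpeechT5 text-to-speech checkpoint, its text/audio processor, and the HiFi-GAN vocoder.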
checkpoint = "microsoft/speecht5_tts"
processor = SpeechT5Processor.from_pretrained(checkpoint)
model = SpeechT5ForTextToSpeech.from_pretrained(checkpoint)
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
# Read the OpenAI key from the environment rather than hard-coding a secret.
ai.api_key = os.environ.get("OPENAI_API_KEY")
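# SpeechBrain x-vector encoder used to extract speaker embeddings from the reference audio.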
spk_model_name = "speechbrain/spkrec-xvect-voxceleb"
device = "cuda" if torch.cuda.is_available() else "cpu"
speaker_model = EncoderClassifier.from_hparams(
    source=spk_model_name,
    run_opts={"device": device},
    savedir=os.path.join("/tmp", spk_model_name),
)
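# Encode a 16 kHz waveform into a normalized x-vector speaker embedding (512-dim, as SpeechT5 expects).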
def create_speaker_embedding(waveform):
    with torch.no_grad():
        speaker_embeddings = speaker_model.encode_batch(torch.tensor(waveform))
        speaker_embeddings = torch.nn.functional.normalize(speaker_embeddings, dim=2)
        speaker_embeddings = speaker_embeddings.squeeze().cpu().numpy()
    return speaker_embeddings
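# Turn the template text/audio pair into the speaker-embedding tensor SpeechT5 consumes.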
def prepare_data(temp_text, temp_audio):
    rate, audio_data = temp_audio
    # Gradio delivers int16 PCM at the recording rate; convert to a mono
    # float waveform and resample to the 16 kHz that SpeechT5 expects.
    audio_data = audio_data.astype(np.float32) / 32768.0
    if audio_data.ndim > 1:
        audio_data = audio_data.mean(axis=1)
    new_rate = 16000
    if rate != new_rate:
        number_of_samples = round(len(audio_data) * float(new_rate) / rate)
        audio_data = sps.resample(audio_data, number_of_samples)
    example = processor(
        text=temp_text,
        audio_target=audio_data,
        sampling_rate=new_rate,
        return_attention_mask=False,
    )
    example["speaker_embeddings"] = create_speaker_embedding(audio_data)
    example_embeddings = torch.tensor(example["speaker_embeddings"]).unsqueeze(0)
    return example_embeddings
def generate_gpt4_response(user_text, print_output=False):
    """
    Query OpenAI GPT-4 and return the reply text.

    :param user_text: str, the user's text to query for
    :param print_output: bool, whether or not to print the raw output JSON
    """
    message = [{"role": "user", "content": user_text + ' in just 2 very small sentences'}]
    completions = ai.ChatCompletion.create(
        model="gpt-4",
        messages=message,
        max_tokens=250,
    )
    # Return the first choice's text
    return completions['choices'][0]['message']['content']
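# Full pipeline: choose the reference audio, get a short GPT-4 reply, then speak it in the cloned voice.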
def predict(temp_text, temp_audio, record_audio_prompt, prompt_text):
    # Fall back to the microphone recording when no template file was uploaded.
    temp_audio = temp_audio if temp_audio is not None else record_audio_prompt
    text = generate_gpt4_response(prompt_text)
    embeddings = prepare_data(temp_text, temp_audio)
    inputs = processor(text=text, return_tensors="pt")
    spectrogram = model.generate_speech(inputs["input_ids"], embeddings)
    with torch.no_grad():
        speech = vocoder(spectrogram)
    # Scale the float waveform to 16-bit PCM for Gradio's numpy audio output.
    speech = (speech.numpy() * 32767).astype(np.int16)
    speech = (16000, speech)
    del temp_text, temp_audio, record_audio_prompt, prompt_text, embeddings, inputs, spectrogram
    gc.collect()
    return text, speech
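# Gradio UI: template text/audio and prompt on the left, generated message and speech on the right.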
app = gr.Blocks()
with app:
    with gr.Row():
        with gr.Column():
            temp_text = gr.Text(label="Template Text")
            temp_audio = gr.Audio(label="Template Speech", type="numpy")
            prompt_text = gr.Text(label="Input Text")
            record_audio_prompt = gr.Audio(label="Recorded Audio Prompt", source="microphone", interactive=True)
        with gr.Column():
            text = gr.Textbox(label="Message")
            speech = gr.Audio(label="Generated Speech", type="numpy")
            btn = gr.Button("Generate!")
    btn.click(predict,
              inputs=[temp_text, temp_audio, record_audio_prompt, prompt_text],
              outputs=[text, speech])
app.launch()