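# app.py - Gradio demo for zero-shot voice cloning with SpeechT5.
# A reference recording is encoded into an x-vector speaker embedding
# (SpeechBrain), which conditions microsoft/speecht5_tts to speak the
# input text in the reference voice; HiFi-GAN renders the waveform.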
import gradio as gr
import librosa
import numpy as np
import torch
import os
from speechbrain.pretrained import EncoderClassifier
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
from scipy.io import wavfile
import scipy.signal as sps
import openai as ai
import gc
from examples import *

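# Load the SpeechT5 text-to-speech model, its processor, and the HiFi-GAN vocoder.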
checkpoint = "microsoft/speecht5_tts"
processor = SpeechT5Processor.from_pretrained(checkpoint)
model = SpeechT5ForTextToSpeech.from_pretrained(checkpoint)
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
# Read the OpenAI key from the environment rather than hard-coding a secret.
ai.api_key = os.environ.get("OPENAI_API_KEY")


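# SpeechBrain x-vector speaker encoder (trained on VoxCeleb) used to
# derive the speaker embedding from the reference audio.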
spk_model_name = "speechbrain/spkrec-xvect-voxceleb"

device = "cuda" if torch.cuda.is_available() else "cpu"
speaker_model = EncoderClassifier.from_hparams(
    source=spk_model_name,
    run_opts={"device": device},
    savedir=os.path.join("/tmp", spk_model_name))

def create_speaker_embedding(waveform):
    # The encoder expects a float waveform; gradio delivers int16 PCM, so
    # cast and peak-normalize to [-1, 1] before encoding.
    waveform = np.asarray(waveform, dtype=np.float32)
    waveform = waveform / max(np.abs(waveform).max(), 1e-9)
    with torch.no_grad():
        speaker_embeddings = speaker_model.encode_batch(torch.tensor(waveform))
        speaker_embeddings = torch.nn.functional.normalize(speaker_embeddings, dim=2)
        speaker_embeddings = speaker_embeddings.squeeze().cpu().numpy()
    return speaker_embeddings

def prepare_data(temp_text, audio_prompt):
    rate, audio_data = audio_prompt
    # Gradio may deliver stereo audio; average the channels to mono.
    if audio_data.ndim > 1:
        audio_data = audio_data.mean(axis=1)
    # Resample the reference clip to the 16 kHz rate expected by both the
    # SpeechT5 processor and the speaker encoder.
    new_rate = 16000
    if rate != new_rate:
        number_of_samples = round(len(audio_data) * float(new_rate) / rate)
        audio_data = sps.resample(audio_data, number_of_samples)
    example = processor(
        text=temp_text,
        audio_target=audio_data,
        sampling_rate=new_rate,
        return_attention_mask=False,
    )
    # Only the speaker embedding is used at inference time.
    example["speaker_embeddings"] = create_speaker_embedding(audio_data)
    example_embeddings = torch.tensor(example["speaker_embeddings"]).unsqueeze(0)
    return example_embeddings

# def generate_gpt4_response(user_text, print_output=False):
#     """
#     Query OpenAI GPT-4 and return the reply text.
#     :param user_text: str, the user's prompt
#     :param print_output: bool, whether to print the raw response JSON
#     """
#     message = [{"role": "user", "content": user_text + ' in just 2 very small sentences'}]
#     completions = ai.ChatCompletion.create(
#         model="gpt-4",
#         messages=message,
#         max_tokens=250,
#     )
#     if print_output:
#         print(completions)
#
#     # Return the first choice's text
#     return completions['choices'][0]['message']['content']


def predict(temp_text, temp_audio, record_audio_prompt, prompt_text):
    # Prefer an uploaded reference clip; fall back to the microphone recording.
    if temp_audio is not None:
        audio_prompt = temp_audio
    else:
        audio_prompt = record_audio_prompt

    # text = generate_gpt4_response(prompt_text)
    text = prompt_text
    embeddings = prepare_data(temp_text, audio_prompt)
    inputs = processor(text=text, return_tensors="pt")
    spectrogram = model.generate_speech(inputs["input_ids"], embeddings)

    with torch.no_grad():
        speech = vocoder(spectrogram)

    # Convert the float waveform to 16-bit PCM for gradio's numpy audio output.
    speech = (speech.numpy() * 32767).astype(np.int16)
    speech = (16000, speech)
    del temp_text, temp_audio, record_audio_prompt, prompt_text, audio_prompt, embeddings, inputs, spectrogram
    gc.collect()
    return text, speech

app = gr.Blocks()
with app:
  with gr.Row():
    with gr.Column():
      temp_text = gr.Text(label="Template Text")
      temp_audio = gr.Audio(label="Template Speech", type="numpy")
      prompt_text = gr.Text(label="Input Text")
      record_audio_prompt = gr.Audio(label="Recorded Audio Prompt", source="microphone", type="numpy")
    with gr.Column():
      text = gr.Textbox(label="Message")
      speech = gr.Audio(label="Generated Speech", type="numpy")
      btn = gr.Button("Generate!")
      btn.click(predict,
                inputs=[temp_text, temp_audio, record_audio_prompt, prompt_text],
                outputs=[text, speech])
  gr.Examples(examples=infer_from_audio_examples,
              inputs=[temp_text, temp_audio, record_audio_prompt, prompt_text],
              outputs=[text, speech],
              fn=predict,
              cache_examples=False)

app.launch()
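# Pass share=True to launch() for a temporary public URL.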