Prasada committed on
Commit 6748343 · 1 parent: 34fabd3

Update app.py

Files changed (1):
  1. app.py +51 -23
app.py CHANGED
@@ -4,14 +4,18 @@ import numpy as np
 import torch
 import os
 import torch
+from speechbrain.pretrained import EncoderClassifier
+from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
 from scipy.io import wavfile
 import scipy.signal as sps
-from speechbrain.pretrained import EncoderClassifier
-from transformers import AutoProcessor, AutoModelForTextToSpectrogram, SpeechT5HifiGan
+import openai as ai
+import gc
 
-processor = AutoProcessor.from_pretrained("microsoft/speecht5_tts")
-model = AutoModelForTextToSpectrogram.from_pretrained("microsoft/speecht5_tts")
+checkpoint = "microsoft/speecht5_tts"
+processor = SpeechT5Processor.from_pretrained(checkpoint)
+model = SpeechT5ForTextToSpeech.from_pretrained(checkpoint)
 vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
+ai.api_key = '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'
 
 
 spk_model_name = "speechbrain/spkrec-xvect-voxceleb"
@@ -31,9 +35,9 @@ def create_speaker_embedding(waveform):
 
 def prepare_data(temp_text, temp_audio):
     rate, audio_data = temp_audio
-    new_rate = 16000
-    number_of_samples = round(len(audio_data) * float(new_rate) / rate)
-    audio_data = sps.resample(audio_data, number_of_samples)
+    # new_rate = 16000
+    # number_of_samples = round(len(audio_data) * float(new_rate) / rate)
+    # audio_data = sps.resample(audio_data, number_of_samples)
     example = processor(
         text=temp_text,
         audio_target=audio_data,
@@ -43,30 +47,54 @@ def prepare_data(temp_text, temp_audio):
     example_embeddings = torch.tensor(example["speaker_embeddings"]).unsqueeze(0)
     return example_embeddings
 
+def generate_gpt4_response(user_text, print_output=False):
+    """
+    Query OpenAI GPT-4 and get back a response.
+    :type user_text: str, the user's text to query for
+    :type print_output: bool, whether or not to print the raw output JSON
+    """
+    message = [{"role": "user", "content": user_text + ' in just 2 very small sentences'}]
+    completions = ai.ChatCompletion.create(
+        model="gpt-4",
+        messages=message,
+        max_tokens=250
+    )
+
+    # Return the first choice's text
+    return completions['choices'][0]['message']['content']
 
-def predict(temp_text, temp_audio, text):
-    text = text
+
+def predict(temp_text, temp_audio, record_audio_prompt, prompt_text):
+    audio_prompt = temp_audio if temp_audio is not None else record_audio_prompt
+    text = generate_gpt4_response(prompt_text)
     embeddings = prepare_data(temp_text, temp_audio)
     inputs = processor(text=text, return_tensors="pt")
     spectrogram = model.generate_speech(inputs["input_ids"], embeddings)
 
     with torch.no_grad():
         speech = vocoder(spectrogram)
-
-    speech = (speech.numpy() * 32767).astype(np.int16)
-    return (16000, speech)
 
+    speech = (speech.numpy() * 32767).astype(np.int16)
+    speech = (16000, speech)
+    del temp_text, temp_audio, record_audio_prompt, prompt_text, audio_prompt, embeddings, inputs, spectrogram
+    gc.collect()
+    return text, speech
 
+app = gr.Blocks()
+with app:
+    with gr.Row():
+        with gr.Column():
 
-gr.Interface(
-    fn=predict,
-    inputs=[
-        gr.Text(label="Template Text"),
-        gr.Audio(label="Template Speech.", type="numpy"),
-        gr.Text(label="Input Text"),
-    ],
-    outputs=[
-        gr.Audio(label="Generated Speech", type="numpy"),
-    ],
+            temp_text = gr.Text(label="Template Text")
+            temp_audio = gr.Audio(label="Template Speech", type="numpy")
+            prompt_text = gr.Text(label="Input Text")
+            record_audio_prompt = gr.Audio(label='recorded audio prompt', source='microphone', interactive=True)
+        with gr.Column():
+            text = gr.Textbox(label="Message")
+            speech = gr.Audio(label="Generated Speech", type="numpy")
+            btn = gr.Button("Generate!")
+    btn.click(predict,
+              inputs=[temp_text, temp_audio, record_audio_prompt, prompt_text],
+              outputs=[text, speech])
 
-).launch()
+app.launch()
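Note: the second hunk's context line references create_speaker_embedding, whose body is not part of this diff. For readers following along, the usual SpeechBrain x-vector recipe behind the EncoderClassifier import looks roughly like the sketch below; the savedir path and device choice are illustrative, not taken from this Space.

# Sketch of the standard x-vector helper for SpeechT5 voice cloning;
# not the Space's exact code, which this diff does not show.
speaker_model = EncoderClassifier.from_hparams(
    source=spk_model_name,
    run_opts={"device": "cpu"},                 # illustrative; a GPU device string also works
    savedir=os.path.join("/tmp", spk_model_name),
)

def create_speaker_embedding(waveform):
    with torch.no_grad():
        embeddings = speaker_model.encode_batch(torch.tensor(waveform))
        embeddings = torch.nn.functional.normalize(embeddings, dim=2)  # unit-norm x-vector
    return embeddings.squeeze()                 # 512-dim, the size SpeechT5 expects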
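The commit comments out the resampling in prepare_data rather than deleting it. SpeechT5's processor expects 16 kHz audio, while gr.Audio hands back whatever rate the browser or file provides, so a conditional guard may be safer than dropping the step entirely. A sketch, reusing the sps alias and the resample-by-sample-count approach from the removed lines:

# Sketch: resample only when the incoming rate differs from SpeechT5's 16 kHz.
target_rate = 16000
if rate != target_rate:
    number_of_samples = round(len(audio_data) * float(target_rate) / rate)
    audio_data = sps.resample(audio_data, number_of_samples)  # FFT-based resampling
    rate = target_rate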
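Two notes on generate_gpt4_response. The API key is hard-coded in app.py; on a public Space it belongs in a secret, read with something like ai.api_key = os.environ["OPENAI_API_KEY"]. Also, ai.ChatCompletion.create is the legacy pre-1.0 openai-python interface; under the 1.x client the same request looks roughly like this (model name, prompt suffix, and max_tokens carried over from the commit):

# Sketch for openai>=1.0; the committed code targets the pre-1.0 module-level API.
from openai import OpenAI

client = OpenAI()  # picks up OPENAI_API_KEY from the environment

def generate_gpt4_response(user_text):
    completion = client.chat.completions.create(
        model="gpt-4",
        messages=[{"role": "user", "content": user_text + " in just 2 very small sentences"}],
        max_tokens=250,
    )
    return completion.choices[0].message.content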
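In the new predict, the (16000, speech) tuple is the format a gr.Audio output with type="numpy" expects, and the * 32767 scaling converts the vocoder's float waveform in [-1, 1] to int16 PCM. Peaks slightly outside that range wrap around after the cast, so a clip before converting is a cheap safeguard (sketch):

# Sketch: clamp to [-1, 1] before the int16 cast to avoid wrap-around on loud peaks.
waveform = np.clip(speech.numpy(), -1.0, 1.0)
speech = (16000, (waveform * 32767).astype(np.int16))

Note also that audio_prompt holds the upload-or-microphone fallback, yet prepare_data is still called with temp_audio, so the recorded prompt never reaches the speaker embedding; passing audio_prompt there is likely the intent.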
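gr.Audio(source='microphone', ...) is the Gradio 3.x signature. If the Space is later upgraded to Gradio 4.x, that argument was replaced by a sources list, so the microphone input would become roughly the following (a sketch assuming Gradio 4.x; the type="numpy" addition keeps it consistent with the other audio components):

# Sketch for Gradio 4.x, where the 3.x `source` argument became a `sources` list.
record_audio_prompt = gr.Audio(
    label="recorded audio prompt",
    sources=["microphone"],
    type="numpy",       # match temp_audio and the speech output
    interactive=True,
)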