Spaces:

Prasada
/

DDP

Runtime error

App Files Community

Prasada committed on Oct 11, 2023

Commit

6748343

1 Parent(s): 34fabd3

Update app.py

Browse files

Files changed (1) hide show

app.py +51 -23

app.py CHANGED Viewed

@@ -4,14 +4,18 @@ import numpy as np
 import torch
 import os
 import torch
 from scipy.io import wavfile
 import scipy.signal as sps
-from speechbrain.pretrained import EncoderClassifier
-from transformers import AutoProcessor, AutoModelForTextToSpectrogram, SpeechT5HifiGan
-processor = AutoProcessor.from_pretrained("microsoft/speecht5_tts")
-model = AutoModelForTextToSpectrogram.from_pretrained("microsoft/speecht5_tts")
 vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
 spk_model_name = "speechbrain/spkrec-xvect-voxceleb"
@@ -31,9 +35,9 @@ def create_speaker_embedding(waveform):
 def prepare_data(temp_text, temp_audio):
     rate, audio_data = temp_audio
-    new_rate = 16000
-    number_of_samples = round(len(audio_data) * float(new_rate) / rate)
-    audio_data = sps.resample(audio_data, number_of_samples)
     example = processor(
         text=temp_text,
         audio_target=audio_data,
@@ -43,30 +47,54 @@ def prepare_data(temp_text, temp_audio):
     example_embeddings = torch.tensor(example["speaker_embeddings"]).unsqueeze(0)
     return example_embeddings
-def predict(temp_text, temp_audio, text):
-    text = text
     embeddings=prepare_data(temp_text, temp_audio)
     inputs = processor(text=text, return_tensors="pt")
     spectrogram = model.generate_speech(inputs["input_ids"], embeddings)
     with torch.no_grad():
         speech = vocoder(spectrogram)
-    speech = (speech.numpy() * 32767).astype(np.int16)
-    return (16000, speech)
-gr.Interface(
-    fn=predict,
-    inputs=[
-        gr.Text(label="Template Text"),
-        gr.Audio(label="Template Speech.", type="numpy"),
-        gr.Text(label="Input Text"),
-    ],
-    outputs=[
-        gr.Audio(label="Generated Speech", type="numpy"),
-    ],
-).launch()

 import torch
 import os
 import torch
+from speechbrain.pretrained import EncoderClassifier
+from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
 from scipy.io import wavfile
 import scipy.signal as sps
+import openai as ai
+import gc
+# Load the SpeechT5 text-to-speech processor/model pair and the HiFi-GAN vocoder.
+checkpoint = "microsoft/speecht5_tts"
+processor = SpeechT5Processor.from_pretrained(checkpoint)
+model = SpeechT5ForTextToSpeech.from_pretrained(checkpoint)
 vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
+# SECURITY: never commit an API key. The key that was hard-coded on this line
+# was published with the commit and must be revoked. Read the key from the
+# environment (for example a secret named OPENAI_API_KEY) instead.
+ai.api_key = os.environ.get("OPENAI_API_KEY")
 spk_model_name = "speechbrain/spkrec-xvect-voxceleb"
 def prepare_data(temp_text, temp_audio):
     rate, audio_data = temp_audio
+    # new_rate = 16000
+    # number_of_samples = round(len(audio_data) * float(new_rate) / rate)
+    # audio_data = sps.resample(audio_data, number_of_samples)
     example = processor(
         text=temp_text,
         audio_target=audio_data,
     example_embeddings = torch.tensor(example["speaker_embeddings"]).unsqueeze(0)
     return example_embeddings
+def generate_gpt4_response(user_text, print_output=False):
+    """
+    Ask OpenAI GPT-4 for a short answer to ``user_text`` and return the reply.
+
+    :param user_text: str, the user's text to query for; an instruction to
+        answer in two very small sentences is appended before sending
+    :param print_output: bool, whether or not to print the raw completion
+        object returned by the API
+    :return: the message content of the first returned choice
+    """
+    # Keep a space between the user's text and the length instruction so the
+    # instruction is not glued onto the last word of the prompt.
+    prompt = user_text + ' in just 2 very small sentences'
+    completions = ai.ChatCompletion.create(
+        model="gpt-4",
+        messages=[{"role": "user", "content": prompt}],
+        max_tokens=250,
+    )
+    if print_output:
+        print(completions)
+    # Return the first choice's text
+    return completions['choices'][0]['message']['content']
+def predict(temp_text, temp_audio, record_audio_prompt, prompt_text):
+    """
+    Answer ``prompt_text`` with GPT-4 and synthesize the answer as speech.
+
+    :param temp_text: template text handed to ``prepare_data`` together with
+        the chosen audio
+    :param temp_audio: template audio as a ``(rate, samples)`` pair, or None
+    :param record_audio_prompt: recorded audio used when ``temp_audio`` is None
+    :param prompt_text: text sent to ``generate_gpt4_response``
+    :return: ``(text, speech)`` where ``text`` is the GPT-4 reply and
+        ``speech`` is a ``(16000, int16 array)`` tuple
+    :raises ValueError: if neither ``temp_audio`` nor ``record_audio_prompt``
+        was provided
+    """
+    # Prefer the template audio and fall back to the recording. The earlier
+    # form read ``audio_prompt`` before assigning it (UnboundLocalError).
+    audio_prompt = temp_audio if temp_audio is not None else record_audio_prompt
+    if audio_prompt is None:
+        # Fail before spending an API call when there is no voice to use.
+        raise ValueError("Provide a template speech clip or record one with the microphone.")
+    text = generate_gpt4_response(prompt_text)
+    # Use the selected audio so the recording fallback actually takes effect.
+    embeddings = prepare_data(temp_text, audio_prompt)
     inputs = processor(text=text, return_tensors="pt")
     spectrogram = model.generate_speech(inputs["input_ids"], embeddings)
     with torch.no_grad():
         speech = vocoder(spectrogram)
+    # Scale the float waveform to 16-bit integer samples.
+    speech = (speech.numpy() * 32767).astype(np.int16)
+    speech = (16000, speech)
+    # Drop the large intermediates before collecting to keep memory use down.
+    del audio_prompt, embeddings, inputs, spectrogram
+    gc.collect()
+    return text, speech
+# Build the two-column UI and wire the button to predict().
+with gr.Blocks() as app:
+    with gr.Row():
+        # First column: the inputs passed to predict().
+        with gr.Column():
+            temp_text = gr.Text(label="Template Text")
+            temp_audio = gr.Audio(label="Template Speech", type="numpy")
+            prompt_text = gr.Text(label="Input Text")
+            record_audio_prompt = gr.Audio(
+                label="recorded audio prompt",
+                source="microphone",
+                interactive=True,
+            )
+        # Second column: the outputs and the button that triggers generation.
+        with gr.Column():
+            text = gr.Textbox(label="Message")
+            speech = gr.Audio(label="Generated Speech", type="numpy")
+            btn = gr.Button("Generate!")
+            btn.click(
+                fn=predict,
+                inputs=[temp_text, temp_audio, record_audio_prompt, prompt_text],
+                outputs=[text, speech],
+            )
+app.launch()