Update app.py
app.py
CHANGED
@@ -1,16 +1,15 @@
-from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
-import streamlit as st
-import base64
-import io
 from huggingface_hub import InferenceClient
 from audiorecorder import audiorecorder
 import speech_recognition as sr
+from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
 from datasets import load_dataset
+import streamlit as st
+import base64
+import io
 import torch
+import os
 
-
-model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
-vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
+model = Wav2Vec2ForCTC.from_pretrained("facebook/mms-tts-spa")
 pre_prompt_text = "You are a behavioral AI, your answers should be brief, stoic and humanistic."
 
 if "history" not in st.session_state:
@@ -18,7 +17,6 @@ if "history" not in st.session_state:
 
 if "pre_prompt_sent" not in st.session_state:
     st.session_state.pre_prompt_sent = False
-
 def recognize_speech(audio_data, show_messages=True):
     recognizer = sr.Recognizer()
     audio_recording = sr.AudioFile(audio_data)
@@ -78,17 +76,14 @@ def generate(audio_text, history, temperature=None, max_new_tokens=512, top_p=0.
         response += response_token.token.text
 
     response = ' '.join(response.split()).replace('</s>', '')
-    audio_file = text_to_speech(response)
+    audio_file = text_to_speech(response, speed=1.3)
     return response, audio_file
 
 def text_to_speech(text):
-
+    with torch.no_grad():
+        logits = model.generate(text)
 
-
-    speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
-    speech = model.generate_speech(inputs.input_ids, speaker_embeddings, vocoder=vocoder)
-    with sf.SoundFile("speech.wav", "w", 16000, 1) as file:
-        file.write(speech.cpu().numpy().astype('float32'))
+    sf.write("speech.wav", logits.numpy(), samplerate=16000)
 
     with open("speech.wav", "rb") as audio_file:
         encoded_audio = base64.b64encode(audio_file.read()).decode()
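Note on the removed branch: the SpeechT5 code depends on a processor, an embeddings_dataset, and an inputs object that are defined outside the visible hunk. For reference, the standard SpeechT5 recipe from the transformers documentation looks roughly like the reconstruction below; it is an assumption about those missing definitions, not the file's actual code.

# Reconstruction of the removed SpeechT5 pipeline, following the standard transformers example.
# The processor/embeddings_dataset lines are assumptions: their definitions fall outside the hunk.
import torch
import soundfile as sf
from datasets import load_dataset
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan

processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

# Index 7306 matches the removed code; the dataset name is the one the transformers docs
# pair with SpeechT5 speaker embeddings.
embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)

inputs = processor(text="Hola, ¿cómo estás?", return_tensors="pt")
speech = model.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=vocoder)
sf.write("speech.wav", speech.numpy(), samplerate=16000)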
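Note on the replacement branch: as committed it is unlikely to run as-is. facebook/mms-tts-spa is an MMS text-to-speech (VITS) checkpoint rather than a Wav2Vec2ForCTC model, Wav2Vec2ForCTC is a speech-recognition head with no text-to-audio path, soundfile (sf) is not among the new imports, and text_to_speech(text) is called with a speed keyword its signature does not accept. Below is a minimal sketch of a text_to_speech that drives this checkpoint through transformers' VITS classes; mapping speed onto the VITS speaking_rate setting is an assumption about what speed=1.3 is meant to do, and tts_model / tts_tokenizer are local names chosen for the sketch.

# Sketch only: assumes the goal is the MMS Spanish TTS checkpoint, which loads as a VITS model.
import torch
import soundfile as sf
from transformers import VitsModel, AutoTokenizer

tts_model = VitsModel.from_pretrained("facebook/mms-tts-spa")
tts_tokenizer = AutoTokenizer.from_pretrained("facebook/mms-tts-spa")

def text_to_speech(text, speed=1.0):
    # speaking_rate is the VITS control closest to the call site's `speed` argument;
    # treating it as the intended mapping is an assumption.
    tts_model.speaking_rate = speed
    inputs = tts_tokenizer(text, return_tensors="pt")
    with torch.no_grad():
        waveform = tts_model(**inputs).waveform  # shape: (batch, num_samples)
    sf.write("speech.wav", waveform.squeeze().cpu().numpy(),
             samplerate=tts_model.config.sampling_rate)
    return "speech.wav"

The existing block that reopens speech.wav and base64-encodes it can then follow unchanged.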