salomonsky committed
Commit 19c888c · verified · 1 Parent(s): cafffb0

Update app.py

Files changed (1): app.py (+10, -15)
app.py CHANGED
@@ -1,16 +1,15 @@
- from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
- import streamlit as st
- import base64
- import io
  from huggingface_hub import InferenceClient
  from audiorecorder import audiorecorder
  import speech_recognition as sr
+ from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
  from datasets import load_dataset
+ import streamlit as st
+ import base64
+ import io
  import torch
+ import os

- processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
- model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
- vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
+ model = Wav2Vec2ForCTC.from_pretrained("facebook/mms-tts-spa")
  pre_prompt_text = "You are a behavioral AI, your answers should be brief, stoic and humanistic."

  if "history" not in st.session_state:
@@ -18,7 +17,6 @@ if "history" not in st.session_state:

  if "pre_prompt_sent" not in st.session_state:
      st.session_state.pre_prompt_sent = False
-
  def recognize_speech(audio_data, show_messages=True):
      recognizer = sr.Recognizer()
      audio_recording = sr.AudioFile(audio_data)
@@ -78,17 +76,14 @@ def generate(audio_text, history, temperature=None, max_new_tokens=512, top_p=0.
          response += response_token.token.text

      response = ' '.join(response.split()).replace('</s>', '')
-     audio_file = text_to_speech(response)
+     audio_file = text_to_speech(response, speed=1.3)
      return response, audio_file

  def text_to_speech(text):
-     inputs = processor(text=text, return_tensors="pt", target_language="es")
+     with torch.no_grad():
+         logits = model.generate(text)

-     embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
-     speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
-     speech = model.generate_speech(inputs.input_ids, speaker_embeddings, vocoder=vocoder)
-     with sf.SoundFile("speech.wav", "w", 16000, 1) as file:
-         file.write(speech.cpu().numpy().astype('float32'))
+     sf.write("speech.wav", logits.numpy(), samplerate=16000)

      with open("speech.wav", "rb") as audio_file:
          encoded_audio = base64.b64encode(audio_file.read()).decode()
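
For reference, the facebook/mms-tts-spa checkpoint is a VITS text-to-speech model, which transformers normally exposes through VitsModel and AutoTokenizer rather than a CTC head. The sketch below shows that usual flow end to end, mirroring the app's speech.wav and base64 handling; the class names, the .waveform output, and the config.sampling_rate field are assumptions for illustration, not code from this commit, and the speed argument added in generate() is not modeled here.

# Sketch only: assumes transformers' VitsModel API for facebook/mms-tts-spa
# (not the Wav2Vec2ForCTC class the commit loads above).
import base64

import soundfile as sf
import torch
from transformers import AutoTokenizer, VitsModel

tts_model = VitsModel.from_pretrained("facebook/mms-tts-spa")
tts_tokenizer = AutoTokenizer.from_pretrained("facebook/mms-tts-spa")

def text_to_speech(text):
    # Tokenize the input and synthesize a waveform tensor of shape (1, num_samples).
    inputs = tts_tokenizer(text, return_tensors="pt")
    with torch.no_grad():
        waveform = tts_model(**inputs).waveform
    # Write the audio at the model's native sampling rate, then return it
    # base64-encoded so it can be embedded in the Streamlit page as the app does.
    sf.write("speech.wav", waveform.squeeze().numpy(), samplerate=tts_model.config.sampling_rate)
    with open("speech.wav", "rb") as audio_file:
        return base64.b64encode(audio_file.read()).decode()

Called as text_to_speech(response), this returns the same kind of base64 string the app embeds after generation.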