salomonsky committed on
Commit
a202e44
verified
1 Parent(s): 9d7f79d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +30 -44
app.py CHANGED
@@ -1,15 +1,18 @@
1
- import streamlit as st
2
- import base64
3
- import io
4
  from huggingface_hub import InferenceClient
5
  from audiorecorder import audiorecorder
6
  import speech_recognition as sr
7
- from pydub import AudioSegment
8
- from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
 
9
  import torch
 
 
 
 
10
 
 
 
11
  tokenizer = AutoTokenizer.from_pretrained("facebook/mms-tts-spa")
12
- model = AutoModelForSeq2SeqLM.from_pretrained("facebook/mms-tts-spa")
13
 
14
  pre_prompt_text = "Eres una IA conductual, tus respuestas deber谩n ser breves, est贸icas y humanistas."
15
 
@@ -55,42 +58,13 @@ def format_prompt(message, history):
55
  prompt += f"[INST] {message} [/INST]"
56
  return prompt
57
 
58
- def generate(audio_text, history, temperature=None, max_new_tokens=512, top_p=0.95, repetition_penalty=1.0):
59
- client = InferenceClient("mistralai/Mixtral-8x7B-Instruct-v0.1")
60
-
61
- temperature = float(temperature) if temperature is not None else 0.9
62
- temperature = max(temperature, 1e-2)
63
- top_p = float(top_p)
64
-
65
- generate_kwargs = dict(
66
- temperature=temperature,
67
- max_new_tokens=max_new_tokens,
68
- top_p=top_p,
69
- repetition_penalty=repetition_penalty,
70
- do_sample=True,
71
- seed=42)
72
-
73
  formatted_prompt = format_prompt(audio_text, history)
74
- stream = client.text_generation(formatted_prompt, **generate_kwargs, stream=True, details=True, return_full_text=True)
75
- response = ""
76
-
77
- for response_token in stream:
78
- response += response_token.token.text
79
-
80
- response = ' '.join(response.split()).replace('</s>', '')
81
- audio_file = text_to_speech(response, speed=1.3)
82
- return response, audio_file
83
-
84
- def text_to_speech(text, speed=1.3):
85
- inputs = tokenizer(text, return_tensors="pt", padding="longest", truncation=True)
86
  with torch.no_grad():
87
  output = model.generate(**inputs)
88
- audio = output[0]
89
- audio_bytes = io.BytesIO()
90
- audio_tensor = torch.tensor(audio).float()
91
- torch.save(audio_tensor, audio_bytes)
92
- audio_bytes.seek(0)
93
- return base64.b64encode(audio_bytes.read()).decode()
94
 
95
  def main():
96
  audio_data = audiorecorder("Presiona para hablar", "Deteniendo la grabaci贸n...")
@@ -101,12 +75,24 @@ def main():
101
  audio_text = recognize_speech("audio.wav")
102
 
103
  if audio_text:
104
- response, audio_file = generate(audio_text, history=st.session_state.history)
105
 
106
  if audio_file is not None:
107
- st.markdown(
108
- f"""<audio autoplay="autoplay" controls="controls" src="data:audio/mp3;base64,{base64.b64encode(audio_file.read()).decode()}" type="audio/mp3" id="audio_player"></audio>""",
109
- unsafe_allow_html=True)
 
 
 
 
 
 
 
 
 
 
 
 
110
 
111
  if __name__ == "__main__":
112
- main()
 
 
 
 
1
import base64
import io
import os

import numpy as np
import speech_recognition as sr
import streamlit as st
import torch
from audiorecorder import audiorecorder
from huggingface_hub import InferenceClient
from pydub import AudioSegment
from scipy.io.wavfile import write
from transformers import AutoTokenizer, VitsModel, VitsProcessor
12
 
13
+ processor = VitsProcessor.from_pretrained("facebook/mms-tts-spa")
14
+ model = VitsModel.from_pretrained("facebook/mms-tts-spa")
15
  tokenizer = AutoTokenizer.from_pretrained("facebook/mms-tts-spa")
 
16
 
17
  pre_prompt_text = "Eres una IA conductual, tus respuestas deber谩n ser breves, est贸icas y humanistas."
18
 
 
58
  prompt += f"[INST] {message} [/INST]"
59
  return prompt
60
 
61
+ def generate(audio_text, history):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
62
  formatted_prompt = format_prompt(audio_text, history)
63
+ inputs = tokenizer(formatted_prompt, return_tensors="pt", padding=True, truncation=True)
 
 
 
 
 
 
 
 
 
 
 
64
  with torch.no_grad():
65
  output = model.generate(**inputs)
66
+ audio = output['audio']
67
+ return audio
 
 
 
 
68
 
69
  def main():
70
  audio_data = audiorecorder("Presiona para hablar", "Deteniendo la grabaci贸n...")
 
75
  audio_text = recognize_speech("audio.wav")
76
 
77
  if audio_text:
78
+ audio_file = generate(audio_text, history=st.session_state.history)
79
 
80
  if audio_file is not None:
81
+ # Guardar el archivo WAV
82
+ write("output.wav", processor.feature_extractor.sampling_rate, audio_file)
83
+
84
+ # Convertir el archivo WAV a MP3 utilizando pydub
85
+ audio = AudioSegment.from_wav("output.wav")
86
+ audio.export("output.mp3", format="mp3")
87
+
88
+ # Leer el archivo MP3 y mostrarlo en Streamlit
89
+ with open("output.mp3", "rb") as file:
90
+ audio_bytes = file.read()
91
+ st.audio(audio_bytes, format="audio/mp3")
92
+
93
+ # Eliminar archivos temporales (opcional)
94
+ os.remove("output.wav")
95
+ os.remove("output.mp3")
96
 
97
  if __name__ == "__main__":
98
+ main()