salomonsky committed on
Commit
f2c3ba6
verified
1 Parent(s): 5e537b2

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +34 -25
app.py CHANGED
@@ -10,28 +10,33 @@ from pydub import AudioSegment
10
  if "history" not in st.session_state:
11
  st.session_state.history = []
12
 
13
- def recognize_speech(audio_data, show_messages=True):
14
- recognizer = sr.Recognizer()
15
- audio_recording = sr.AudioFile(audio_data)
16
-
17
- with audio_recording as source:
18
- audio = recognizer.record(source)
19
-
20
- try:
21
- audio_text = recognizer.recognize_google(audio, language="es-ES")
22
- if show_messages:
23
- st.subheader("Texto Reconocido:")
24
- st.write(audio_text)
25
- st.success("Reconocimiento de voz completado.")
26
- except sr.UnknownValueError:
27
- st.warning("No se pudo reconocer el audio. ¿Intentaste grabar algo?")
28
- audio_text = ""
29
- except sr.RequestError:
30
- st.error("Hablame para comenzar!")
31
- audio_text = ""
 
 
 
32
 
33
  return audio_text
34
 
 
 
35
  def format_prompt(message, history):
36
  prompt = "<s>"
37
 
@@ -42,6 +47,7 @@ def format_prompt(message, history):
42
  prompt += f"[INST] {message} [/INST]"
43
  return prompt
44
 
 
45
  def generate(audio_text, history, temperature=None, max_new_tokens=512, top_p=0.95, repetition_penalty=1.0):
46
  client = InferenceClient("mistralai/Mixtral-8x7B-Instruct-v0.1")
47
 
@@ -70,6 +76,7 @@ def generate(audio_text, history, temperature=None, max_new_tokens=512, top_p=0.
70
  audio_file = text_to_speech(response, speed=1.3)
71
  return response, audio_file
72
 
 
73
  def text_to_speech(text, speed=1.3):
74
  tts = gTTS(text=text, lang='es')
75
  audio_fp = io.BytesIO()
@@ -82,6 +89,13 @@ def text_to_speech(text, speed=1.3):
82
  modified_audio_fp.seek(0)
83
  return modified_audio_fp
84
 
 
 
 
 
 
 
 
85
  def main():
86
  st.title("Chatbot de Voz a Voz")
87
  st.info("Habla para grabar...")
@@ -97,12 +111,7 @@ def main():
97
  output, audio_file = generate(audio_text, history=st.session_state.history)
98
 
99
  if audio_file is not None:
100
- st.markdown(
101
- f"""
102
- <audio autoplay="autoplay" controls="controls" src="data:audio/mp3;base64,{base64.b64encode(audio_file.read()).decode()}" type="audio/mp3" id="audio_player"></audio>
103
- """,
104
- unsafe_allow_html=True
105
- )
106
 
107
  if __name__ == "__main__":
108
  main()
 
10
  if "history" not in st.session_state:
11
  st.session_state.history = []
12
 
13
+ recognizer = sr.Recognizer()
14
+ microphone = sr.Microphone()
15
+
16
+ # reconociendo voz
17
+ def recognize_speech_with_vad(audio_data, show_messages=True):
18
+ with microphone as source:
19
+ try:
20
+ st.info("Escuchando...")
21
+ audio_data.record(source, vad_enabled=True)
22
+ st.success("Fin de la grabación. Procesando audio...")
23
+ audio_text = recognizer.recognize_google(audio_data, language="es-ES")
24
+
25
+ if show_messages:
26
+ st.subheader("Texto Reconocido:")
27
+ st.write(audio_text)
28
+
29
+ except sr.UnknownValueError:
30
+ st.warning("No se pudo reconocer el audio. ¿Intentaste grabar algo?")
31
+ audio_text = ""
32
+ except sr.RequestError:
33
+ st.error("Hablame para comenzar!")
34
+ audio_text = ""
35
 
36
  return audio_text
37
 
38
+
39
+ # preparando entrada para el modelo de lenguaje
40
  def format_prompt(message, history):
41
  prompt = "<s>"
42
 
 
47
  prompt += f"[INST] {message} [/INST]"
48
  return prompt
49
 
50
+ # generando respuesta en texto
51
  def generate(audio_text, history, temperature=None, max_new_tokens=512, top_p=0.95, repetition_penalty=1.0):
52
  client = InferenceClient("mistralai/Mixtral-8x7B-Instruct-v0.1")
53
 
 
76
  audio_file = text_to_speech(response, speed=1.3)
77
  return response, audio_file
78
 
79
+ # texto a voz
80
  def text_to_speech(text, speed=1.3):
81
  tts = gTTS(text=text, lang='es')
82
  audio_fp = io.BytesIO()
 
89
  modified_audio_fp.seek(0)
90
  return modified_audio_fp
91
 
92
+ # reproductor de texto a voz
93
+ def audio_player_markup(audio_file):
94
+ return f"""
95
+ <audio autoplay="autoplay" controls="controls" src="data:audio/mp3;base64,{base64.b64encode(audio_file.read()).decode()}" type="audio/mp3" id="audio_player"></audio>
96
+ """
97
+
98
+ # interfaz de usuario
99
  def main():
100
  st.title("Chatbot de Voz a Voz")
101
  st.info("Habla para grabar...")
 
111
  output, audio_file = generate(audio_text, history=st.session_state.history)
112
 
113
  if audio_file is not None:
114
+ st.markdown(audio_player_markup(audio_file), unsafe_allow_html=True)
 
 
 
 
 
115
 
116
  if __name__ == "__main__":
117
  main()