salomonsky commited on
Commit
39f14d4
·
verified ·
1 Parent(s): 50a3067

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +31 -38
app.py CHANGED
@@ -3,9 +3,9 @@ import base64
3
  import io
4
  from huggingface_hub import InferenceClient
5
  from gtts import gTTS
 
6
  import speech_recognition as sr
7
  from pydub import AudioSegment
8
- from audiorecorder import audiorecorder
9
 
10
  pre_prompt_text = "eres una IA conductual, tus respuestas serán breves."
11
 
@@ -17,31 +17,26 @@ if "pre_prompt_sent" not in st.session_state:
17
 
18
  def recognize_speech(audio_data, show_messages=True):
19
  recognizer = sr.Recognizer()
 
20
 
 
 
21
  try:
22
- audio_data.seek(0)
23
- audio = gTTS(audio_data.read(), lang='es')
24
- audio_recording = sr.AudioFile(io.BytesIO(audio_data.read()))
25
-
26
- with audio_recording as source:
27
- audio = recognizer.record(source)
28
-
29
  audio_text = recognizer.recognize_google(audio, language="es-ES")
30
  if show_messages:
31
  st.subheader("Texto Reconocido:")
32
  st.write(audio_text)
33
- st.success("Reconocimiento completado.")
34
  except sr.UnknownValueError:
35
- st.warning("¡Habla fuerte y claro!")
36
  audio_text = ""
37
  except sr.RequestError:
38
- st.error("¡Háblame para comenzar!")
39
  audio_text = ""
40
 
41
  return audio_text
42
 
43
  def format_prompt(message, history):
44
- preprompt = "Este es el inicio de la conversación. "
45
  prompt = "<s>"
46
 
47
  for user_prompt, bot_response in history:
@@ -49,9 +44,9 @@ def format_prompt(message, history):
49
  prompt += f" {bot_response}</s> "
50
 
51
  prompt += f"[INST] {message} [/INST]"
52
- return preprompt + prompt
53
 
54
- def generate(audio_text, history, temperature=None, max_new_tokens=256, top_p=0.95, repetition_penalty=1.0):
55
  client = InferenceClient("mistralai/Mixtral-8x7B-Instruct-v0.1")
56
 
57
  temperature = float(temperature) if temperature is not None else 0.9
@@ -74,9 +69,9 @@ def generate(audio_text, history, temperature=None, max_new_tokens=256, top_p=0.
74
 
75
  for response_token in stream:
76
  response += response_token.token.text
77
-
78
  response = ' '.join(response.split()).replace('</s>', '')
79
- audio_file = text_to_speech(response)
80
  return response, audio_file
81
 
82
  def text_to_speech(text, speed=1.3):
@@ -84,41 +79,39 @@ def text_to_speech(text, speed=1.3):
84
  audio_fp = io.BytesIO()
85
  tts.write_to_fp(audio_fp)
86
  audio_fp.seek(0)
87
- audio = AudioSegment.from_file(audio_fp, format="mp3")
88
  modified_speed_audio = audio.speedup(playback_speed=speed)
89
  modified_audio_fp = io.BytesIO()
90
  modified_speed_audio.export(modified_audio_fp, format="mp3")
91
  modified_audio_fp.seek(0)
92
  return modified_audio_fp
93
 
94
- def play_audio(audio_file):
95
- audio_data = audio_file.read()
96
- b64_audio = base64.b64encode(audio_data).decode("utf-8")
97
- st.audio(f"data:audio/mp3;base64,{b64_audio}", format="audio/mp3", start_time=0)
98
 
99
- def main():
100
- st.title("Chatbot de Voz a Voz")
101
- audio_text = ""
102
 
 
 
 
 
 
 
103
  if not st.session_state.pre_prompt_sent:
104
  st.session_state.pre_prompt_sent = True
105
- st.session_state.history.append((pre_prompt_text, ""))
106
- pre_prompt_audio_file = text_to_speech(pre_prompt_text)
107
- play_audio(pre_prompt_audio_file)
108
-
109
- audio_data = audiorecorder.record("Hablar ▶️", "Detener 🛑")
110
 
111
- if not audio_data.empty():
112
- st.audio(audio_data.export().read(), format="audio/mp3")
 
 
 
113
  audio_text = recognize_speech(audio_data)
114
-
115
  if audio_text:
116
- output, audio_file = generate(audio_text, history=st.session_state.history)
117
-
118
- if audio_text:
119
- st.session_state.history.append((audio_text, output))
120
- if audio_file is not None:
121
- play_audio(audio_file)
122
 
123
  if __name__ == "__main__":
124
  main()
 
3
  import io
4
  from huggingface_hub import InferenceClient
5
  from gtts import gTTS
6
+ from audiorecorder import audiorecorder
7
  import speech_recognition as sr
8
  from pydub import AudioSegment
 
9
 
10
  pre_prompt_text = "eres una IA conductual, tus respuestas serán breves."
11
 
 
17
 
18
  def recognize_speech(audio_data, show_messages=True):
19
  recognizer = sr.Recognizer()
20
+ audio_recording = sr.AudioFile(audio_data)
21
 
22
+ with audio_recording as source:
23
+ audio = recognizer.record(source)
24
  try:
 
 
 
 
 
 
 
25
  audio_text = recognizer.recognize_google(audio, language="es-ES")
26
  if show_messages:
27
  st.subheader("Texto Reconocido:")
28
  st.write(audio_text)
29
+ st.success("Reconocimiento de voz completado.")
30
  except sr.UnknownValueError:
31
+ st.warning("No se pudo reconocer el audio. ¿Intentaste grabar algo?")
32
  audio_text = ""
33
  except sr.RequestError:
34
+ st.error("Háblame para comenzar!")
35
  audio_text = ""
36
 
37
  return audio_text
38
 
39
  def format_prompt(message, history):
 
40
  prompt = "<s>"
41
 
42
  for user_prompt, bot_response in history:
 
44
  prompt += f" {bot_response}</s> "
45
 
46
  prompt += f"[INST] {message} [/INST]"
47
+ return prompt
48
 
49
+ def generate(audio_text, history, temperature=None, max_new_tokens=512, top_p=0.95, repetition_penalty=1.0):
50
  client = InferenceClient("mistralai/Mixtral-8x7B-Instruct-v0.1")
51
 
52
  temperature = float(temperature) if temperature is not None else 0.9
 
69
 
70
  for response_token in stream:
71
  response += response_token.token.text
72
+
73
  response = ' '.join(response.split()).replace('</s>', '')
74
+ audio_file = text_to_speech(response, speed=1.3)
75
  return response, audio_file
76
 
77
  def text_to_speech(text, speed=1.3):
 
79
  audio_fp = io.BytesIO()
80
  tts.write_to_fp(audio_fp)
81
  audio_fp.seek(0)
82
+ audio = AudioSegment.from_mp3(audio_fp)
83
  modified_speed_audio = audio.speedup(playback_speed=speed)
84
  modified_audio_fp = io.BytesIO()
85
  modified_speed_audio.export(modified_audio_fp, format="mp3")
86
  modified_audio_fp.seek(0)
87
  return modified_audio_fp
88
 
89
+ def audio_play(audio_fp):
90
+ st.audio(audio_fp.read(), format="audio/mp3", start_time=0)
 
 
91
 
92
+ def display_recognition_result(audio_text, output, audio_file):
93
+ if audio_text:
94
+ st.session_state.history.append((audio_text, output))
95
 
96
+ if audio_file is not None:
97
+ st.markdown(
98
+ f"""<audio autoplay="autoplay" controls="controls" src="data:audio/mp3;base64,{base64.b64encode(audio_file.read()).decode()}" type="audio/mp3" id="audio_player"></audio>""",
99
+ unsafe_allow_html=True)
100
+
101
+ def main():
102
  if not st.session_state.pre_prompt_sent:
103
  st.session_state.pre_prompt_sent = True
 
 
 
 
 
104
 
105
+ audio_bytes = audiorecorder("Hablar ▶️", "Detener 🛑")
106
+ if audio_bytes:
107
+ st.audio(audio_bytes, format="audio/wav")
108
+ audio_data = io.BytesIO(audio_bytes)
109
+ audio_data.seek(0)
110
  audio_text = recognize_speech(audio_data)
111
+
112
  if audio_text:
113
+ output, audio_file = generate(audio_text, history=st.session_state.history)
114
+ display_recognition_result(audio_text, output, audio_file)
 
 
 
 
115
 
116
  if __name__ == "__main__":
117
  main()