salomonsky committed on
Commit
7ce7fe0
verified
1 Parent(s): 4b2a14d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +51 -39
app.py CHANGED
@@ -1,36 +1,39 @@
1
  import streamlit as st
 
 
 
2
  import base64
3
  import io
4
  from huggingface_hub import InferenceClient
5
  from gtts import gTTS
6
- from audiorecorder import audiorecorder
7
- import speech_recognition as sr
8
- from pydub import AudioSegment
9
 
10
  if "history" not in st.session_state:
11
  st.session_state.history = []
12
 
13
- def callback(recognizer, audio):
14
-
 
 
 
 
 
 
 
 
 
15
  try:
16
- audio_text = recognizer.recognize_google(audio, language="es-ES")
17
- st.subheader("Texto Reconocido:")
18
- st.write(audio_text)
19
- st.success("Reconocimiento de voz completado.")
20
- output, audio_file = generate(audio_text, history=st.session_state.history)
21
- if audio_text:
22
- st.session_state.history.append((audio_text, output))
23
- if audio_file is not None:
24
- st.markdown(
25
- f"""
26
- <audio autoplay="autoplay" controls="controls" src="data:audio/mp3;base64,{base64.b64encode(audio_file.read()).decode()}" type="audio/mp3" id="audio_player"></audio>
27
- """,
28
- unsafe_allow_html=True
29
- )
30
- except sr.UnknownValueError:
31
- st.warning("No se pudo reconocer el audio.")
32
- except sr.RequestError as e:
33
- st.error(f"Error al solicitar resultados de Google Speech Recognition: {e}")
34
 
35
  def format_prompt(message, history):
36
  prompt = "<s>"
@@ -65,7 +68,7 @@ def generate(audio_text, history, temperature=None, max_new_tokens=512, top_p=0.
65
 
66
  for response_token in stream:
67
  response += response_token.token.text
68
-
69
  response = ' '.join(response.split()).replace('</s>', '')
70
  audio_file = text_to_speech(response, speed=1.3)
71
  return response, audio_file
@@ -75,29 +78,38 @@ def text_to_speech(text, speed=1.3):
75
  audio_fp = io.BytesIO()
76
  tts.write_to_fp(audio_fp)
77
  audio_fp.seek(0)
78
- audio = AudioSegment.from_file(audio_fp, format="mp3")
79
  modified_speed_audio = audio.speedup(playback_speed=speed)
80
  modified_audio_fp = io.BytesIO()
81
  modified_speed_audio.export(modified_audio_fp, format="mp3")
82
  modified_audio_fp.seek(0)
83
  return modified_audio_fp
84
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
85
  def main():
86
  st.title("Chatbot de Voz a Voz")
87
- microphones = sr.Microphone.list_microphone_names()
88
-
89
- if microphones:
90
- selected_microphone_index = 1 if len(microphones) > 1 else 0
91
- audio_data = sr.Microphone(device_index=selected_microphone_index)
92
- else:
93
- st.warning("No se encontraron dispositivos.")
94
 
95
- r = sr.Recognizer()
96
- m = sr.Microphone(device_index=selected_microphone_index)
97
- with m as source:
98
- r.adjust_for_ambient_noise(source)
99
- st.info("Di algo para comenzar la conversación.")
100
- stop_listening = r.listen_in_background(m, callback)
 
 
101
 
102
  if __name__ == "__main__":
103
- main()
 
1
  import streamlit as st
2
+ import webrtcvad
3
+ import pydub
4
+ import numpy as np
5
  import base64
6
  import io
7
  from huggingface_hub import InferenceClient
8
  from gtts import gTTS
9
+
10
+ st.title("Chatbot de Voz a Voz")
 
11
 
12
  if "history" not in st.session_state:
13
  st.session_state.history = []
14
 
15
+ vad = webrtcvad.Vad()
16
+ vad.set_mode(3)
17
+
18
+ buffer = []
19
+ frames_per_buffer = 480
20
+ audio_rate = 16000
21
+ channels = 1
22
+ seconds_per_frame = frames_per_buffer / audio_rate
23
+ vad_threshold = 0.5
24
+
25
+ def callback(data):
26
  try:
27
+ audio_array = np.frombuffer(data, dtype=np.int16)
28
+ is_speech = vad.is_speech(data, sample_rate=audio_rate)
29
+
30
+ buffer.append(audio_array)
31
+
32
+ if not is_speech:
33
+ save_audio_buffer()
34
+
35
+ except Exception as e:
36
+ st.error(f"Error durante la captura de audio: {e}")
 
 
 
 
 
 
 
 
37
 
38
  def format_prompt(message, history):
39
  prompt = "<s>"
 
68
 
69
  for response_token in stream:
70
  response += response_token.token.text
71
+
72
  response = ' '.join(response.split()).replace('</s>', '')
73
  audio_file = text_to_speech(response, speed=1.3)
74
  return response, audio_file
 
78
  audio_fp = io.BytesIO()
79
  tts.write_to_fp(audio_fp)
80
  audio_fp.seek(0)
81
+ audio = pydub.AudioSegment.from_file(audio_fp, format="mp3")
82
  modified_speed_audio = audio.speedup(playback_speed=speed)
83
  modified_audio_fp = io.BytesIO()
84
  modified_speed_audio.export(modified_audio_fp, format="mp3")
85
  modified_audio_fp.seek(0)
86
  return modified_audio_fp
87
 
88
+ def save_audio_buffer():
89
+ if buffer:
90
+ audio_array = np.concatenate(buffer)
91
+ audio_segment = pydub.AudioSegment(
92
+ audio_array.tobytes(),
93
+ frame_rate=audio_rate,
94
+ sample_width=audio_array.dtype.itemsize,
95
+ channels=channels,
96
+ )
97
+
98
+ st.audio(audio_array, format="audio/wav", channels=channels)
99
+
100
+ buffer.clear()
101
+
102
  def main():
103
  st.title("Chatbot de Voz a Voz")
 
 
 
 
 
 
 
104
 
105
+ st._webrtc_audio_recorder(
106
+ key="audio",
107
+ sample_rate=audio_rate,
108
+ channels=channels,
109
+ format="pcm",
110
+ on_data=callback,
111
+ start_streaming=st.button("Iniciar/Detener Grabación"),
112
+ )
113
 
114
  if __name__ == "__main__":
115
+ main()