salomonsky committed on
Commit
eeaa144
verified
1 Parent(s): cbd9627

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +41 -22
app.py CHANGED
@@ -3,9 +3,12 @@ import base64
3
  import io
4
  from huggingface_hub import InferenceClient
5
  from gtts import gTTS
6
- import sounddevice as sd
7
- import speech_recognition as sr
8
  from pydub import AudioSegment
 
 
 
 
 
9
 
10
  if "history" not in st.session_state:
11
  st.session_state.history = []
@@ -29,7 +32,35 @@ def recognize_speech_with_vad(audio_data, show_messages=True):
29
  audio_text = ""
30
 
31
  return audio_text
32
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
33
  # Preparando entrada para el modelo de lenguaje
34
  def format_prompt(message, history):
35
  prompt = "<s>"
@@ -89,28 +120,16 @@ def audio_player_markup(audio_file):
89
  <audio autoplay="autoplay" controls="controls" src="data:audio/mp3;base64,{base64.b64encode(audio_file.read()).decode()}" type="audio/mp3" id="audio_player"></audio>
90
  """
91
 
92
- # Interfaz de usuario
93
  def main():
94
  st.title("Chatbot de Voz a Voz")
95
-
96
- # Configuración de dispositivos de entrada
97
- input_devices = sd.query_devices(kind='input')
98
- selected_device = st.selectbox("Selecciona tu micrófono:", [device['name'] for device in input_devices])
99
 
100
- # Captura de audio con sounddevice
101
- with sd.InputStream(device=input_devices[st.session_state.selected_device]['index'], channels=2, dtype='int16', callback=None):
102
- audio_data = sd.rec(int(44100 * 5), samplerate=44100, channels=2, dtype='int16')
103
- sd.wait()
104
-
105
- st.audio(audio_data, format="audio/wav")
106
- audio_text = recognize_speech_with_vad(audio_data)
107
-
108
- if audio_text:
109
- st.success("Frase detectada. Procesando audio...")
110
- output, audio_file = generate(audio_text, history=st.session_state.history)
111
-
112
- if audio_file is not None:
113
- st.markdown(audio_player_markup(audio_file), unsafe_allow_html=True)
114
 
115
  if __name__ == "__main__":
116
  main()
 
3
  import io
4
  from huggingface_hub import InferenceClient
5
  from gtts import gTTS
 
 
6
from pydub import AudioSegment
from pydub.playback import play
from streamlit_webrtc import webrtc_streamer, VideoProcessorBase, AudioProcessorBase
import cv2
import numpy as np
import speech_recognition as sr
12
 
13
  if "history" not in st.session_state:
14
  st.session_state.history = []
 
32
  audio_text = ""
33
 
34
  return audio_text
35
+
36
# Audio processor that buffers microphone input for voice-activity detection
# with streamlit_webrtc (the original comment called it a "video" processor).
class VADProcessor(AudioProcessorBase):
    """Accumulate roughly 5 seconds of incoming audio, then recognize it once.

    NOTE(review): the file's import line brings in ``VideoProcessorBase``,
    but this class subclasses ``AudioProcessorBase`` — the import must
    include ``AudioProcessorBase`` or the class definition raises NameError.
    """

    def __init__(self):
        # int16 matches the raw sample format appended in recv(); the
        # original used a float64 buffer, silently upcasting every chunk.
        self.buffer = np.zeros((0,), dtype=np.int16)
        # Gate: set False after the first recognized phrase so no further
        # audio is processed.
        self.vad_active = True

    def recv(self, audio_data):
        # NOTE(review): streamlit_webrtc normally passes an av.AudioFrame
        # here, not a bytes-like object — np.frombuffer may need
        # audio_data.to_ndarray() instead; confirm against the installed
        # streamlit_webrtc version.
        if self.vad_active:
            samples = np.frombuffer(audio_data, dtype=np.int16)
            self.buffer = np.concatenate((self.buffer, samples), axis=None)

            # ~5 seconds of audio — assumes a 44.1 kHz sample rate
            # (TODO confirm the actual capture rate).
            if len(self.buffer) >= 44100 * 5:
                st.audio(self.buffer, format="audio/wav")
                audio_text = recognize_speech_with_vad(self.buffer)

                if audio_text:
                    st.success("Frase detectada. Procesando audio...")
                    output, audio_file = generate(audio_text, history=st.session_state.history)

                    if audio_file is not None:
                        play(audio_file)

                    # Deactivate VAD after a phrase has been handled.
                    self.vad_active = False

                # Drop the consumed window either way.
                self.buffer = np.zeros((0,), dtype=np.int16)

        # Return the incoming frame so the WebRTC pipeline keeps flowing;
        # the original implicitly returned None, which breaks the
        # processor contract.
        return audio_data
64
  # Preparando entrada para el modelo de lenguaje
65
  def format_prompt(message, history):
66
  prompt = "<s>"
 
120
  <audio autoplay="autoplay" controls="controls" src="data:audio/mp3;base64,{base64.b64encode(audio_file.read()).decode()}" type="audio/mp3" id="audio_player"></audio>
121
  """
122
 
123
# User interface: stream microphone audio through streamlit_webrtc.
def main():
    st.title("Chatbot de Voz a Voz")

    # Audio-only stream; every incoming chunk is handed to a VADProcessor
    # instance created by the factory.
    stream_constraints = {"video": False, "audio": True}
    webrtc_ctx = webrtc_streamer(
        media_stream_constraints=stream_constraints,
        async_processing=True,
        audio_processor_factory=VADProcessor,
        key="vad",
    )


if __name__ == "__main__":
    main()