salomonsky committed (verified)
Commit 4058233
Parent: 8f5c027

Update app.py

Files changed (1): app.py (+54, -26)
app.py CHANGED
@@ -8,52 +8,83 @@ import os
 
 class VoiceAssistant:
     def __init__(self):
-        # Load the Wav2Vec2 model for Spanish speech recognition
         self.processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-large-xlsr-53-spanish")
         self.model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-large-xlsr-53-spanish")
 
-        # Audio parameters
         self.sample_rate = 16000
         self.chunk_size = 480
+
         self.p = pyaudio.PyAudio()
-        self.stream = self.p.open(format=pyaudio.paFloat32, channels=1, rate=self.sample_rate, input=True, frames_per_buffer=self.chunk_size)
+        self.input_device_index = self.select_input_device()
+
+        self.stream = self.p.open(
+            format=pyaudio.paFloat32,
+            channels=1,
+            rate=self.sample_rate,
+            input=True,
+            input_device_index=self.input_device_index,
+            frames_per_buffer=self.chunk_size
+        )
 
-        # Keywords
         self.keyword_activation = "jarvis"
         self.keyword_deactivation = "detente"
 
-        # Listening state
         self.listening = False
 
-    def vad_collector(self, vad_threshold=0.5):
+    def select_input_device(self):
+        for i in range(self.p.get_device_count()):
+            dev = self.p.get_device_info_by_index(i)
+            if dev['maxInputChannels'] > 0:
+                print(f"Dispositivo {i}: {dev['name']}")
+
+        for i in range(self.p.get_device_count()):
+            dev = self.p.get_device_info_by_index(i)
+            if dev['maxInputChannels'] > 0:
+                try:
+                    test_stream = self.p.open(
+                        format=pyaudio.paFloat32,
+                        channels=1,
+                        rate=self.sample_rate,
+                        input=True,
+                        input_device_index=i,
+                        frames_per_buffer=self.chunk_size
+                    )
+                    test_stream.close()
+                    return i
+                except Exception:
+                    continue
+
+        raise RuntimeError("No input device found")
+
+    def vad_collector(self):
         audio_chunks, keyword_detected = [], False
         while self.listening:
-            data = self.stream.read(self.chunk_size)
-            audio_chunk = np.frombuffer(data, dtype=np.float32)
-
-            # Detect the activation keyword
-            if self.keyword_activation.lower() in str(audio_chunk).lower():
-                keyword_detected = True
-                break
-
-            # Detect the deactivation keyword
-            if self.keyword_deactivation.lower() in str(audio_chunk).lower():
-                self.listening = False
+            try:
+                data = self.stream.read(self.chunk_size)
+                audio_chunk = np.frombuffer(data, dtype=np.float32)
+
+                if self.keyword_activation.lower() in str(audio_chunk).lower():
+                    keyword_detected = True
+                    break
+
+                if self.keyword_deactivation.lower() in str(audio_chunk).lower():
+                    self.listening = False
+                    break
+
+                audio_chunks.append(audio_chunk)
+            except Exception as e:
+                st.error(f"Audio capture error: {e}")
                 break
-
-            audio_chunks.append(audio_chunk)
 
         return audio_chunks, keyword_detected
 
     def transcribe_audio(self, audio_chunks):
         audio_data = np.concatenate(audio_chunks)
 
-        # Process and transcribe the audio with Wav2Vec2
         input_values = self.processor(audio_data, return_tensors="pt", sampling_rate=self.sample_rate).input_values
         with torch.no_grad():
             logits = self.model(input_values).logits
 
-        # Decode the transcription
         predicted_ids = torch.argmax(logits, dim=-1)
         transcription = self.processor.decode(predicted_ids[0])
 
@@ -63,7 +94,6 @@ class VoiceAssistant:
         return "Respuesta generada para: " + text
 
     def text_to_speech(self, text):
-        # Use gTTS to convert text to speech
         tts = gTTS(text=text, lang='es')
         output_path = "response.mp3"
         tts.save(output_path)
@@ -72,12 +102,10 @@ class VoiceAssistant:
     def run(self):
         st.title("Asistente de Voz JARVIS")
 
-        # Button to start/stop listening
         if st.button("Iniciar/Detener Escucha"):
             self.listening = not self.listening
             st.write("Escucha activada." if self.listening else "Escucha desactivada.")
 
-        # Transcribe and synthesize speech while listening is active
         if self.listening:
             audio_chunks, keyword_detected = self.vad_collector()
 
@@ -97,4 +125,4 @@ def main():
     assistant.run()
 
 if __name__ == "__main__":
-    main()
+    main()
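
The hunks start at line 8 of app.py, so the file's import block is not part of the diff (the first hunk header only carries "import os" as context). A minimal import block consistent with the names the code uses might look like the sketch below; it is an assumption, not the file's actual header:

# Assumed imports for app.py; the real header is not shown in this diff.
import os

import numpy as np
import pyaudio
import streamlit as st
import torch
from gtts import gTTS
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor

As with any Streamlit script, the app would be launched with "streamlit run app.py"; the PyAudio capture loop additionally assumes the process has access to a local microphone.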