Update app.py
app.py CHANGED
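The hunk below starts at line 8, just after the import block (the hunk context shows import os), so the file's imports are not part of the diff. Judging only from the names used in the code that follows, the top of the file presumably looks something like this sketch (an assumption, not the actual header of app.py):

import os
import numpy as np
import pyaudio
import torch
import streamlit as st
from gtts import gTTS
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor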
@@ -8,52 +8,83 @@ import os

class VoiceAssistant:
    def __init__(self):
-        # Load the Wav2Vec2 model for Spanish speech recognition
        self.processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-large-xlsr-53-spanish")
        self.model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-large-xlsr-53-spanish")

-        # Audio parameters
        self.sample_rate = 16000
        self.chunk_size = 480
+
        self.p = pyaudio.PyAudio()
-        self.
+        self.input_device_index = self.select_input_device()
+
+        self.stream = self.p.open(
+            format=pyaudio.paFloat32,
+            channels=1,
+            rate=self.sample_rate,
+            input=True,
+            input_device_index=self.input_device_index,
+            frames_per_buffer=self.chunk_size
+        )

-        # Keywords
        self.keyword_activation = "jarvis"
        self.keyword_deactivation = "detente"

-        # Listening state
        self.listening = False
-
-    def
+
+    def select_input_device(self):
+        for i in range(self.p.get_device_count()):
+            dev = self.p.get_device_info_by_index(i)
+            if dev['maxInputChannels'] > 0:
+                print(f"Dispositivo {i}: {dev['name']}")
+
+        for i in range(self.p.get_device_count()):
+            dev = self.p.get_device_info_by_index(i)
+            if dev['maxInputChannels'] > 0:
+                try:
+                    test_stream = self.p.open(
+                        format=pyaudio.paFloat32,
+                        channels=1,
+                        rate=self.sample_rate,
+                        input=True,
+                        input_device_index=i,
+                        frames_per_buffer=self.chunk_size
+                    )
+                    test_stream.close()
+                    return i
+                except Exception:
+                    continue
+
+        raise RuntimeError("No input device found")
+
+    def vad_collector(self):
        audio_chunks, keyword_detected = [], False
        while self.listening:
-
-
-
-
-
-
-
-
-
-
-
+            try:
+                data = self.stream.read(self.chunk_size)
+                audio_chunk = np.frombuffer(data, dtype=np.float32)
+
+                if self.keyword_activation.lower() in str(audio_chunk).lower():
+                    keyword_detected = True
+                    break
+
+                if self.keyword_deactivation.lower() in str(audio_chunk).lower():
+                    self.listening = False
+                    break
+
+                audio_chunks.append(audio_chunk)
+            except Exception as e:
+                st.error(f"Audio capture error: {e}")
                break
-
-            audio_chunks.append(audio_chunk)

        return audio_chunks, keyword_detected

    def transcribe_audio(self, audio_chunks):
        audio_data = np.concatenate(audio_chunks)

-        # Process and transcribe the audio using Wav2Vec2
        input_values = self.processor(audio_data, return_tensors="pt", sampling_rate=self.sample_rate).input_values
        with torch.no_grad():
            logits = self.model(input_values).logits

-        # Decode the transcription
        predicted_ids = torch.argmax(logits, dim=-1)
        transcription = self.processor.decode(predicted_ids[0])


@@ -63,7 +94,6 @@ class VoiceAssistant:
        return "Respuesta generada para: " + text

    def text_to_speech(self, text):
-        # Use gTTS to convert text to speech
        tts = gTTS(text=text, lang='es')
        output_path = "response.mp3"
        tts.save(output_path)

@@ -72,12 +102,10 @@ class VoiceAssistant:
    def run(self):
        st.title("Asistente de Voz JARVIS")

-        # Button to start/stop listening
        if st.button("Iniciar/Detener Escucha"):
            self.listening = not self.listening
            st.write("Escucha activada." if self.listening else "Escucha desactivada.")

-        # Transcribe and synthesize a spoken response while listening is active
        if self.listening:
            audio_chunks, keyword_detected = self.vad_collector()


@@ -97,4 +125,4 @@ def main():
    assistant.run()

if __name__ == "__main__":
-    main()
+    main()
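A note on the new vad_collector: the keyword checks run str(audio_chunk).lower() over the raw float32 samples, so they can only ever match text that happens to appear in the printed array, never a spoken "jarvis" or "detente". Detecting a spoken keyword would require transcribing the buffered audio first, for example with the same Wav2Vec2 pipeline the class already uses. A minimal sketch of that idea; the helper name is illustrative and not part of app.py:

import numpy as np
import torch

def contains_keyword(processor, model, chunks, keyword, sample_rate=16000):
    # Transcribe a list of float32 audio chunks with Wav2Vec2 and check
    # whether the decoded text contains the keyword.
    # Illustrative helper only; not taken from the Space's app.py.
    if not chunks:
        return False
    audio = np.concatenate(chunks)
    inputs = processor(audio, return_tensors="pt", sampling_rate=sample_rate).input_values
    with torch.no_grad():
        logits = model(inputs).logits
    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = processor.decode(predicted_ids[0])
    return keyword.lower() in transcription.lower()

Inside the while loop this would replace the str(audio_chunk) checks: accumulate a short window of chunks, call the helper with self.processor and self.model, and set keyword_detected or clear self.listening based on the result.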
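The updated __init__ opens a PyAudio input stream, but none of the visible hunks stop or close it, and Streamlit re-runs the script (and so re-creates VoiceAssistant) on every interaction. If the rest of app.py does not already handle this, a small cleanup method along the following lines would release the device; it is a sketch using only the attributes defined in the diff, not code from the commit:

    def close(self):
        # Stop the input stream and release the PyAudio instance.
        if self.stream.is_active():
            self.stream.stop_stream()
        self.stream.close()
        self.p.terminate()

Calling such a method before re-opening a stream (or at the end of run()) would avoid leaking microphone handles across Streamlit reruns.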