salomonsky committed on
Commit 4024bc6 (verified)
1 Parent(s): 0ca4c37

Update app.py

Files changed (1)
  1. app.py +103 -8
app.py CHANGED
@@ -1,8 +1,103 @@
- streamlit==1.29.0
- torch==2.1.2
- numpy==1.22.0
- huggingface_hub==0.20.3
- transformers==4.36.2
- sounddevice==0.4.6
- TTS==0.22.0
- pyaudio==0.2.14
+ import streamlit as st
+ import torch
+ import numpy as np
+ from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
+ import pyaudio
+ import sounddevice as sd
+ from TTS.api import TTS
+
+ class VoiceAssistant:
+     def __init__(self):
+         # Load the Wav2Vec2 model for Spanish speech recognition
+         self.processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-large-xlsr-53-spanish")
+         self.model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-large-xlsr-53-spanish")
+
+         # Load the TTS (text-to-speech) model
+         self.tts_model = TTS(model_name="tts_models/es/css10/full-dataset", progress_bar=False)
+
+         # Audio parameters
+         self.sample_rate = 16000
+         self.chunk_size = 480
+         self.p = pyaudio.PyAudio()
+         self.stream = self.p.open(format=pyaudio.paFloat32, channels=1, rate=self.sample_rate, input=True, frames_per_buffer=self.chunk_size)
+
+         # Activation and deactivation keywords
+         self.keyword_activation = "jarvis"
+         self.keyword_deactivation = "detente"
+
+         # Listening state
+         self.listening = False
+
+     def vad_collector(self, vad_threshold=0.5):
+         audio_chunks, keyword_detected = [], False
+         while self.listening:
+             data = self.stream.read(self.chunk_size)
+             audio_chunk = np.frombuffer(data, dtype=np.float32)
+
+             # Detect the activation keyword
+             if self.keyword_activation.lower() in str(audio_chunk).lower():
+                 keyword_detected = True
+                 break
+
+             # Detect the deactivation keyword
+             if self.keyword_deactivation.lower() in str(audio_chunk).lower():
+                 self.listening = False
+                 break
+
+             audio_chunks.append(audio_chunk)
+
+         return audio_chunks, keyword_detected
+
+     def transcribe_audio(self, audio_chunks):
+         audio_data = np.concatenate(audio_chunks)
+
+         # Preprocess and transcribe the audio with Wav2Vec2
+         input_values = self.processor(audio_data, return_tensors="pt", sampling_rate=self.sample_rate).input_values
+         with torch.no_grad():
+             logits = self.model(input_values).logits
+
+         # Get the transcribed text
+         predicted_ids = torch.argmax(logits, dim=-1)
+         transcription = self.processor.decode(predicted_ids[0])
+
+         return transcription
+
+     def generate_response(self, text):
+         return "Respuesta generada para: " + text
+
+     def text_to_speech(self, text):
+         output_path = "response.wav"
+         self.tts_model.tts_to_file(text=text, file_path=output_path)
+         return output_path
+
+     def run(self):
+         st.title("Asistente de Voz JARVIS")
+
+         if st.button("Iniciar/Detener Escucha"):
+             self.listening = not self.listening
+             if self.listening:
+                 st.write("Escucha activada. Esperando palabra clave 'JARVIS'...")
+             else:
+                 st.write("Escucha desactivada.")
+
+         if self.listening:
+             audio_chunks, keyword_detected = self.vad_collector()
+
+             if keyword_detected:
+                 st.success("Palabra clave 'JARVIS' detectada. Procesando...")
+
+                 transcribed_text = self.transcribe_audio(audio_chunks)
+                 st.write(f"Texto transcrito: {transcribed_text}")
+
+                 response = self.generate_response(transcribed_text)
+                 st.write(f"Respuesta: {response}")
+
+                 audio_path = self.text_to_speech(response)
+                 st.audio(audio_path)
+
+ def main():
+     assistant = VoiceAssistant()
+     assistant.run()
+
+ if __name__ == "__main__":
+     main()
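
The eight pinned packages removed from app.py above are the dependencies the new code needs at runtime; presumably they now live in a separate requirements.txt (an assumption, that file is not part of this commit):

streamlit==1.29.0
torch==2.1.2
numpy==1.22.0
huggingface_hub==0.20.3
transformers==4.36.2
sounddevice==0.4.6
TTS==0.22.0
pyaudio==0.2.14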
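
Note that vad_collector checks self.keyword_activation against str(audio_chunk), the printed representation of raw float32 samples, so neither keyword can be detected from speech this way. A minimal sketch of a transcription-based check, assuming the wake word is found by running the already-loaded Wav2Vec2 model over a buffered audio window (detect_keyword is a hypothetical helper, not part of this commit):

    # Hypothetical helper, not in the commit: detect a keyword by transcribing a buffered window.
    def detect_keyword(self, audio_window, keyword):
        inputs = self.processor(audio_window, return_tensors="pt", sampling_rate=self.sample_rate).input_values
        with torch.no_grad():
            logits = self.model(inputs).logits
        text = self.processor.decode(torch.argmax(logits, dim=-1)[0])
        return keyword.lower() in text.lower()

A single 480-sample chunk (30 ms at 16 kHz) is too short to contain a word, so the window passed to such a check would need to cover roughly one to two seconds of accumulated audio.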
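
sounddevice is imported as sd but never used. If the intent was to capture audio with it rather than the blocking PyAudio stream, a minimal sketch of a fixed-length recording (record_utterance is an illustrative name, not part of the commit):

import numpy as np
import sounddevice as sd

def record_utterance(seconds=3.0, sample_rate=16000):
    # Record mono float32 audio and block until the buffer is filled
    audio = sd.rec(int(seconds * sample_rate), samplerate=sample_rate, channels=1, dtype="float32")
    sd.wait()
    return np.squeeze(audio)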