salomonsky committed on
Commit 0ca4c37 (verified)
1 parent: a835363

Update app.py

Files changed (1)
app.py +8 -86
app.py CHANGED
@@ -1,86 +1,8 @@
- import gradio as gr
- import torch
- import numpy as np
- import json
- import pyaudio
- from huggingface_hub import hf_hub_download
- from vosk import Model, KaldiRecognizer
- from TTS.api import TTS
- from scipy.io import wavfile
-
- class VoiceAssistant:
-     def __init__(self):
-         self.vad_model = torch.hub.load('snakers4/silero-vad', 'silero_vad', force_reload=True)
-         model_path = hf_hub_download(repo_id="alphacep/vosk-model-small-es", filename="model.zip")
-         self.vosk_model = Model(model_path)
-         self.tts_model = TTS(model_name="tts_models/es/css10/full-dataset", progress_bar=False)
-         self.sample_rate = 16000
-         self.chunk_size = 480
-         self.p = pyaudio.PyAudio()
-         self.stream = self.p.open(format=pyaudio.paFloat32, channels=1, rate=self.sample_rate, input=True, frames_per_buffer=self.chunk_size)
-         self.keyword = "jarvis"
-
-     def vad_collector(self, vad_threshold=0.5):
-         audio_chunks, keyword_detected = [], False
-         while True:
-             data = self.stream.read(self.chunk_size)
-             audio_chunk = np.frombuffer(data, dtype=np.float32)
-             speech_prob = self.vad_model(torch.from_numpy(audio_chunk), self.sample_rate).item()
-
-             if speech_prob > vad_threshold:
-                 audio_chunks.append(audio_chunk)
-                 recognizer = KaldiRecognizer(self.vosk_model, self.sample_rate)
-                 recognizer.AcceptWaveform(audio_chunk.tobytes())
-                 result = json.loads(recognizer.Result())
-
-                 if self.keyword.lower() in result.get('text', '').lower():
-                     keyword_detected = True
-                     break
-
-             if keyword_detected:
-                 break
-
-         return audio_chunks, keyword_detected
-
-     def transcribe_audio(self, audio_chunks):
-         audio_data = np.concatenate(audio_chunks)
-         recognizer = KaldiRecognizer(self.vosk_model, self.sample_rate)
-         recognizer.AcceptWaveform(audio_data.tobytes())
-         result = json.loads(recognizer.Result())
-         return result.get('text', '')
-
-     def generate_response(self, text):
-         return "Respuesta generada para: " + text
-
-     def text_to_speech(self, text):
-         output_path = "response.wav"
-         self.tts_model.tts_to_file(text=text, file_path=output_path)
-         return output_path
-
- def process_audio():
-     assistant = VoiceAssistant()
-     audio_chunks, keyword_detected = assistant.vad_collector()
-
-     if keyword_detected:
-         transcribed_text = assistant.transcribe_audio(audio_chunks)
-         response = assistant.generate_response(transcribed_text)
-         audio_path = assistant.text_to_speech(response)
-         return transcribed_text, response, audio_path
-     else:
-         return "No se detectó la palabra clave.", "", ""
-
- iface = gr.Interface(
-     fn=process_audio,
-     inputs=[],
-     outputs=[
-         gr.Textbox(label="Texto Transcrito"),
-         gr.Textbox(label="Respuesta Generada"),
-         gr.Audio(label="Audio Generado")
-     ],
-     live=True,
-     title="Asistente de Voz JARVIS",
-     description="Presiona el botón para comenzar la escucha y decir 'JARVIS'."
- )
-
- if __name__ == "__main__":
-     iface.launch()
 
+ streamlit==1.29.0
+ torch==2.1.2
+ numpy==1.22.0
+ huggingface_hub==0.20.3
+ transformers==4.36.2
+ sounddevice==0.4.6
+ TTS==0.22.0
+ pyaudio==0.2.14