salomonsky committed on
Commit: d1c33f9 (verified)
1 Parent(s): 0ad71a2

Update app.py

Files changed (1)
  1. app.py +34 -38
app.py CHANGED
@@ -1,17 +1,12 @@
-import streamlit as st
-import base64
-import io
+import gradio as gr
 import torch
 import numpy as np
+import json
+import pyaudio
 from huggingface_hub import hf_hub_download
-from scipy.io import wavfile
-from scipy.signal import butter, lfilter
-from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
 from vosk import Model, KaldiRecognizer
-import pyaudio
-import json
-import sounddevice as sd
 from TTS.api import TTS
+from scipy.io import wavfile
 
 class VoiceAssistant:
     def __init__(self):
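Note on this hunk: moving `import json` and `import pyaudio` back in fixes NameErrors, since the class body uses both; `from scipy.io import wavfile` is re-added but does not appear to be used anywhere in the shown diff. The `__init__` body itself (old lines 18-23) falls outside the diff context, but the imports and the attributes referenced later (`self.vosk_model`, `self.tts_model`, `self.sample_rate`, `self.chunk_size`) suggest a setup roughly like the sketch below. Every concrete value here is a placeholder for illustration, not something taken from this repo:

from vosk import Model
from TTS.api import TTS

class VoiceAssistant:
    def __init__(self):
        # All concrete values below are assumptions, not values from this commit.
        self.sample_rate = 16000           # Vosk models typically expect 16 kHz mono
        self.chunk_size = 4000             # frames read per call from the mic stream
        self.vosk_model = Model("model")   # placeholder path to an unpacked Vosk model
        self.tts_model = TTS(model_name="tts_models/es/css10/vits")  # placeholder id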
@@ -24,7 +19,7 @@ class VoiceAssistant:
         self.p = pyaudio.PyAudio()
         self.stream = self.p.open(format=pyaudio.paFloat32, channels=1, rate=self.sample_rate, input=True, frames_per_buffer=self.chunk_size)
         self.keyword = "jarvis"
-
+
     def vad_collector(self, vad_threshold=0.5):
         audio_chunks, keyword_detected = [], False
         while True:
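One thing to watch in this hunk: the stream is opened with `format=pyaudio.paFloat32`, but Vosk's `KaldiRecognizer` expects 16-bit little-endian mono PCM bytes, so feeding it float32 buffers via `tobytes()` (as `vad_collector` does below) is likely to produce empty or garbage transcripts. A minimal sketch of the conversion, assuming the stream opened above; the helper name is ours, not from app.py:

import numpy as np

def read_chunk_as_pcm16(stream, chunk_size):
    # Read one float32 chunk and convert it to the int16 PCM that Vosk expects.
    raw = stream.read(chunk_size, exception_on_overflow=False)
    samples = np.frombuffer(raw, dtype=np.float32)
    samples = np.clip(samples, -1.0, 1.0)               # guard against int16 overflow
    return (samples * 32767.0).astype(np.int16).tobytes()

Opening the stream with `format=pyaudio.paInt16` in the first place would avoid the conversion entirely.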
@@ -37,54 +32,55 @@ class VoiceAssistant:
             recognizer = KaldiRecognizer(self.vosk_model, self.sample_rate)
             recognizer.AcceptWaveform(audio_chunk.tobytes())
             result = json.loads(recognizer.Result())
-
+
             if self.keyword.lower() in result.get('text', '').lower():
                 keyword_detected = True
                 break
 
             if keyword_detected:
                 break
-
+
         return audio_chunks, keyword_detected
-
+
     def transcribe_audio(self, audio_chunks):
         audio_data = np.concatenate(audio_chunks)
         recognizer = KaldiRecognizer(self.vosk_model, self.sample_rate)
         recognizer.AcceptWaveform(audio_data.tobytes())
         result = json.loads(recognizer.Result())
         return result.get('text', '')
-
+
     def generate_response(self, text):
         return "Respuesta generada para: " + text
-
+
     def text_to_speech(self, text):
         output_path = "response.wav"
         self.tts_model.tts_to_file(text=text, file_path=output_path)
         return output_path
-
-    def run(self):
-        st.title("Asistente de Voz JARVIS")
-
-        if st.button("Iniciar Escucha"):
-            st.write("Esperando palabra clave 'JARVIS'...")
-
-            audio_chunks, keyword_detected = self.vad_collector()
-
-            if keyword_detected:
-                st.success("Palabra clave detectada. Procesando...")
-
-                transcribed_text = self.transcribe_audio(audio_chunks)
-                st.write(f"Texto transcrito: {transcribed_text}")
-
-                response = self.generate_response(transcribed_text)
-                st.write(f"Respuesta: {response}")
-
-                audio_path = self.text_to_speech(response)
-                st.audio(audio_path)
 
-def main():
+def process_audio():
     assistant = VoiceAssistant()
-    assistant.run()
+    audio_chunks, keyword_detected = assistant.vad_collector()
+
+    if keyword_detected:
+        transcribed_text = assistant.transcribe_audio(audio_chunks)
+        response = assistant.generate_response(transcribed_text)
+        audio_path = assistant.text_to_speech(response)
+        return transcribed_text, response, audio_path
+    else:
+        return "No se detectó la palabra clave.", "", ""
+
+iface = gr.Interface(
+    fn=process_audio,
+    inputs=[],
+    outputs=[
+        gr.Textbox(label="Texto Transcrito"),
+        gr.Textbox(label="Respuesta Generada"),
+        gr.Audio(label="Audio Generado")
+    ],
+    live=True,
+    title="Asistente de Voz JARVIS",
+    description="Presiona el botón para comenzar la escucha y decir 'JARVIS'."
+)
 
 if __name__ == "__main__":
-    main()
+    iface.launch()
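Two likely issues in the added Gradio wiring. First, on the no-keyword path `process_audio` returns `""` for the `gr.Audio` output; Gradio audio outputs expect a file path (or `None`), and an empty string can fail when the component tries to serve it as a file. Second, `live=True` with `inputs=[]` gives the interface no input events to react to, so a plain click-triggered call is more predictable. A hedged sketch of the safer return:

def process_audio():
    assistant = VoiceAssistant()
    audio_chunks, keyword_detected = assistant.vad_collector()
    if not keyword_detected:
        # None is a valid "empty" value for a gr.Audio output; "" is not a file path.
        return "No se detectó la palabra clave.", "", None
    transcribed_text = assistant.transcribe_audio(audio_chunks)
    response = assistant.generate_response(transcribed_text)
    audio_path = assistant.text_to_speech(response)
    return transcribed_text, response, audio_path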
 
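A broader caveat: PyAudio records from the machine running the app, so on a hosted Space, where the server has no microphone, this capture path will not work at all. The usual pattern is to record in the browser with a `gr.Audio` microphone input and run recognition on the uploaded file. A minimal sketch, assuming Gradio 4.x and a hypothetical `transcribe_file` helper that wraps the existing Vosk code:

import gradio as gr

def process_file(audio_path):
    assistant = VoiceAssistant()
    # transcribe_file is a hypothetical helper (read the WAV, feed KaldiRecognizer);
    # it is not part of app.py as committed.
    text = assistant.transcribe_file(audio_path)
    response = assistant.generate_response(text)
    return text, response, assistant.text_to_speech(response)

demo = gr.Interface(
    fn=process_file,
    inputs=gr.Audio(sources=["microphone"], type="filepath"),
    outputs=[
        gr.Textbox(label="Texto Transcrito"),
        gr.Textbox(label="Respuesta Generada"),
        gr.Audio(label="Audio Generado"),
    ],
    title="Asistente de Voz JARVIS",
)

if __name__ == "__main__":
    demo.launch()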