salomonsky committed on
Commit
72e1090
verified
1 Parent(s): 686ef78

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +23 -22
app.py CHANGED
@@ -3,7 +3,7 @@ import base64
3
  import io
4
  from huggingface_hub import InferenceClient
5
  from gtts import gTTS
6
- from audiorecorder import audiorecorder
7
  import speech_recognition as sr
8
  from pydub import AudioSegment
9
 
@@ -11,15 +11,10 @@ if "history" not in st.session_state:
11
  st.session_state.history = []
12
 
13
  recognizer = sr.Recognizer()
14
- microphone = sr.Microphone()
15
 
16
- # reconociendo voz
17
  def recognize_speech_with_vad(audio_data, show_messages=True):
18
  try:
19
- with sr.AudioFile(audio_data) as source:
20
- audio_data = recognizer.record(source, vad_enabled=True)
21
- st.success("Fin de la grabación. Procesando audio...")
22
-
23
  audio_text = recognizer.recognize_google(audio_data, language="es-ES")
24
 
25
  if show_messages:
@@ -35,7 +30,7 @@ def recognize_speech_with_vad(audio_data, show_messages=True):
35
 
36
  return audio_text
37
 
38
- # preparando entrada para el modelo de lenguaje
39
  def format_prompt(message, history):
40
  prompt = "<s>"
41
 
@@ -46,7 +41,7 @@ def format_prompt(message, history):
46
  prompt += f"[INST] {message} [/INST]"
47
  return prompt
48
 
49
- # generando respuesta en texto
50
  def generate(audio_text, history, temperature=None, max_new_tokens=512, top_p=0.95, repetition_penalty=1.0):
51
  client = InferenceClient("mistralai/Mixtral-8x7B-Instruct-v0.1")
52
 
@@ -75,7 +70,7 @@ def generate(audio_text, history, temperature=None, max_new_tokens=512, top_p=0.
75
  audio_file = text_to_speech(response, speed=1.3)
76
  return response, audio_file
77
 
78
- # texto a voz
79
  def text_to_speech(text, speed=1.3):
80
  tts = gTTS(text=text, lang='es')
81
  audio_fp = io.BytesIO()
@@ -88,28 +83,34 @@ def text_to_speech(text, speed=1.3):
88
  modified_audio_fp.seek(0)
89
  return modified_audio_fp
90
 
91
- # reproductor de texto a voz
92
  def audio_player_markup(audio_file):
93
  return f"""
94
  <audio autoplay="autoplay" controls="controls" src="data:audio/mp3;base64,{base64.b64encode(audio_file.read()).decode()}" type="audio/mp3" id="audio_player"></audio>
95
  """
96
 
97
- # interfaz de usuario
98
  def main():
99
  st.title("Chatbot de Voz a Voz")
100
- audio_data = audiorecorder("Deteniendo la grabación...", vad_enabled=True)
 
 
 
 
 
 
 
 
101
 
102
- if not audio_data.empty():
103
- st.audio(audio_data.export().read(), format="audio/wav")
104
- audio_data.export("audio.wav", format="wav")
105
- audio_text = recognize_speech_with_vad("audio.wav")
106
 
107
- if audio_text:
108
- st.success("Frase detectada. Procesando audio...")
109
- output, audio_file = generate(audio_text, history=st.session_state.history)
110
 
111
- if audio_file is not None:
112
- st.markdown(audio_player_markup(audio_file), unsafe_allow_html=True)
113
 
114
  if __name__ == "__main__":
115
  main()
 
3
  import io
4
  from huggingface_hub import InferenceClient
5
  from gtts import gTTS
6
+ import sounddevice as sd
7
  import speech_recognition as sr
8
  from pydub import AudioSegment
9
 
 
11
  st.session_state.history = []
12
 
13
  recognizer = sr.Recognizer()
 
14
 
15
+ # Reconociendo voz con VAD
16
  def recognize_speech_with_vad(audio_data, show_messages=True):
17
  try:
 
 
 
 
18
  audio_text = recognizer.recognize_google(audio_data, language="es-ES")
19
 
20
  if show_messages:
 
30
 
31
  return audio_text
32
 
33
+ # Preparando entrada para el modelo de lenguaje
34
  def format_prompt(message, history):
35
  prompt = "<s>"
36
 
 
41
  prompt += f"[INST] {message} [/INST]"
42
  return prompt
43
 
44
+ # Generando respuesta en texto
45
  def generate(audio_text, history, temperature=None, max_new_tokens=512, top_p=0.95, repetition_penalty=1.0):
46
  client = InferenceClient("mistralai/Mixtral-8x7B-Instruct-v0.1")
47
 
 
70
  audio_file = text_to_speech(response, speed=1.3)
71
  return response, audio_file
72
 
73
+ # Texto a voz
74
  def text_to_speech(text, speed=1.3):
75
  tts = gTTS(text=text, lang='es')
76
  audio_fp = io.BytesIO()
 
83
  modified_audio_fp.seek(0)
84
  return modified_audio_fp
85
 
86
+ # Reproductor de texto a voz
87
  def audio_player_markup(audio_file):
88
  return f"""
89
  <audio autoplay="autoplay" controls="controls" src="data:audio/mp3;base64,{base64.b64encode(audio_file.read()).decode()}" type="audio/mp3" id="audio_player"></audio>
90
  """
91
 
92
+ # Interfaz de usuario
93
# User interface
def main():
    """Streamlit entry point for the voice-to-voice chatbot.

    Records a short clip from a user-selected microphone, transcribes it
    with Google Speech Recognition, sends the text to the language model,
    and plays back the spoken answer.
    """
    import wave  # stdlib; local import keeps the top-of-file imports untouched

    st.title("Chatbot de Voz a Voz")

    # Bug fix: sd.query_devices(kind='input') returns only the DEFAULT input
    # device, not a list — enumerate all devices and keep capture-capable ones.
    input_devices = [d for d in sd.query_devices() if d.get('max_input_channels', 0) > 0]
    if not input_devices:
        st.error("No se detectaron dispositivos de entrada de audio.")
        return

    names = [d['name'] for d in input_devices]
    # Bug fix: the original indexed with st.session_state.selected_device,
    # which was never assigned; use the selectbox's own return value.
    chosen = st.selectbox("Selecciona tu micrófono:", names)
    device_index = input_devices[names.index(chosen)]['index']

    sample_rate = 44100
    seconds = 5
    # Bug fix: sd.rec opens its own stream; the original's surrounding
    # sd.InputStream competed for the same device and served no purpose.
    # Mono 16-bit PCM is what the speech recognizer expects.
    recording = sd.rec(int(sample_rate * seconds), samplerate=sample_rate,
                       channels=1, dtype='int16', device=device_index)
    sd.wait()
    pcm = recording.tobytes()

    # Wrap the raw PCM in a WAV container so st.audio can actually play it
    # (a bare NumPy buffer labelled "audio/wav" is not a valid WAV stream).
    wav_buf = io.BytesIO()
    with wave.open(wav_buf, 'wb') as wav:
        wav.setnchannels(1)
        wav.setsampwidth(2)  # int16 -> 2 bytes per sample
        wav.setframerate(sample_rate)
        wav.writeframes(pcm)
    wav_buf.seek(0)
    st.audio(wav_buf.read(), format="audio/wav")

    # Bug fix: recognize_google needs an sr.AudioData, not a NumPy array.
    audio_text = recognize_speech_with_vad(sr.AudioData(pcm, sample_rate, 2))

    if audio_text:
        st.success("Frase detectada. Procesando audio...")
        output, audio_file = generate(audio_text, history=st.session_state.history)

        if audio_file is not None:
            st.markdown(audio_player_markup(audio_file), unsafe_allow_html=True)
114
 
115
if __name__ == "__main__":
    # Launch the Streamlit app when executed as a script.
    main()