salomonsky commited on
Commit
bdccd83
·
verified ·
1 Parent(s): 444f76a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +101 -82
app.py CHANGED
@@ -1,90 +1,109 @@
1
- import tempfile
2
- import webrtcvad
3
- import speech_recognition as sr
4
- import numpy as np
5
  import streamlit as st
6
- import sounddevice as sd
7
- from traceback import format_exc
 
 
 
 
 
 
 
8
 
9
- def update_vad_status(status):
10
- vad_status.text(status)
11
 
12
- def process_microphone_input():
13
- # Configuramos la tasa de muestreo y el tamaño del frame
14
- sample_rate = 16000
15
- frame_size = 30
16
- chunk_size = 1024 # Adjust as needed for responsiveness
17
 
18
- # Creamos un objeto VAD y un reconocedor de voz
19
- vad = webrtcvad.Vad()
20
  recognizer = sr.Recognizer()
 
21
 
22
- # Indicadores de estado
23
- vad_active = False
24
- speech_detected = False
25
- phrase = ""
26
 
27
  try:
28
- # Configuramos la captura de audio desde el micrófono
29
- with sd.InputStream(callback=callback, channels=1, dtype=np.int16):
30
- st.warning("Habla y observa los cambios en tiempo real...")
31
-
32
- # Mantenemos la aplicación en ejecución
33
- st.experimental_rerun()
34
-
35
- except Exception as e:
36
- st.error(f"Error during microphone input: {e}")
37
- st.error("Traceback:")
38
- st.error(format_exc())
39
-
40
- def callback(indata, frames, time, status):
41
- if status:
42
- print(f"Error in callback: {status}")
43
- return
44
-
45
- # Procesamos el audio en chunks
46
- for i in range(0, len(indata), chunk_size):
47
- chunk = indata[i:i + chunk_size]
48
-
49
- # Procesamos cada chunk en frames
50
- for j in range(0, len(chunk), frame_size):
51
- # Obtenemos el frame actual
52
- frame = chunk[j:j + frame_size]
53
-
54
- # Detectamos si hay voz en el frame
55
- is_speech = vad.is_speech(frame, sample_rate)
56
-
57
- # Actualizamos los indicadores de estado
58
- if is_speech and not vad_active:
59
- vad_active = True
60
- speech_detected = True
61
- update_vad_status("️ Detección de voz iniciada")
62
- elif not is_speech and vad_active:
63
- vad_active = False
64
- update_vad_status("⏹️ Detección de voz finalizada")
65
-
66
- # Si se ha detectado voz y hay un silencio, transcribimos la frase
67
- if speech_detected and not is_speech:
68
- # Transcribimos la frase
69
- with sr.AudioData(frame.tobytes(), sample_rate) as source:
70
- audio = recognizer.record(source)
71
- try:
72
- text = recognizer.recognize_google(audio)
73
- phrase += f" {text}"
74
- st.text(f"️ {text}")
75
- except sr.RequestError:
76
- st.error("⚠️ Error al transcribir la frase - RequestError")
77
- except sr.UnknownValueError:
78
- st.error("⚠️ No se ha reconocido la frase - UnknownValueError")
79
-
80
- # Reiniciamos el indicador de frase
81
- speech_detected = False
82
-
83
- # Imprimimos la frase completa
84
- st.success(f"Transcripción completa: {phrase}")
85
-
86
- # Streamlit UI
87
- st.title("VAD and Speech Recognition App (Microphone Input)")
88
-
89
- vad_status = st.empty()
90
- process_microphone_input()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
import streamlit as st
import base64
import io
from huggingface_hub import InferenceClient
from gtts import gTTS
from audiorecorder import audiorecorder
import speech_recognition as sr
from pydub import AudioSegment

# Behavioural system prompt, injected once per session (see format_prompt).
pre_prompt_text = "Eres una IA conductual, tus respuestas deberán ser breves, estóicas y humanistas."

# Per-session conversation state: (user_text, bot_text) pairs.
if "history" not in st.session_state:
    st.session_state.history = []

# Tracks whether the pre-prompt has already been sent this session.
if "pre_prompt_sent" not in st.session_state:
    st.session_state.pre_prompt_sent = False
def recognize_speech(audio_data, show_messages=True):
    """Transcribe a WAV recording to Spanish text via Google Speech Recognition.

    Args:
        audio_data: Path or file-like object accepted by ``sr.AudioFile``.
        show_messages: When True, echo the recognized text in the Streamlit UI.

    Returns:
        The recognized text, or ``""`` when recognition fails.
    """
    recognizer = sr.Recognizer()
    with sr.AudioFile(audio_data) as source:
        recorded = recognizer.record(source)

    try:
        audio_text = recognizer.recognize_google(recorded, language="es-ES")
        if show_messages:
            st.subheader("Texto Reconocido:")
            st.write(audio_text)
            st.success("Reconocimiento de voz completado.")
    except sr.UnknownValueError:
        # Audio contained no intelligible speech.
        st.warning("No se pudo reconocer el audio. ¿Intentaste grabar algo?")
        audio_text = ""
    except sr.RequestError:
        # Remote API unreachable or rejected the request.
        st.error("Hablame para comenzar!")
        audio_text = ""

    return audio_text
def format_prompt(message, history):
    """Build a Mixtral-style ``[INST]`` prompt from the conversation history.

    The behavioural pre-prompt is prepended only on the first call of the
    session, tracked via ``st.session_state.pre_prompt_sent``.

    Args:
        message: The new user utterance.
        history: Iterable of (user_prompt, bot_response) pairs.

    Returns:
        The fully formatted prompt string.
    """
    parts = ["<s>"]

    if not st.session_state.pre_prompt_sent:
        parts.append(f"[INST] {pre_prompt_text} [/INST]")
        st.session_state.pre_prompt_sent = True

    for user_turn, bot_turn in history:
        parts.append(f"[INST] {user_turn} [/INST] {bot_turn}</s> ")

    parts.append(f"[INST] {message} [/INST]")
    return "".join(parts)
def generate(audio_text, history, temperature=None, max_new_tokens=512, top_p=0.95, repetition_penalty=1.0):
    """Stream a Mixtral completion for *audio_text* and synthesize it as speech.

    Args:
        audio_text: The user's transcribed utterance.
        history: (user, bot) pairs passed through to ``format_prompt``.
        temperature / max_new_tokens / top_p / repetition_penalty: Sampling
            controls forwarded to the inference endpoint.

    Returns:
        Tuple of (response_text, mp3_file_like).
    """
    client = InferenceClient("mistralai/Mixtral-8x7B-Instruct-v0.1")

    # Default and clamp the temperature to a small positive floor.
    if temperature is None:
        temperature = 0.9
    temperature = max(float(temperature), 1e-2)

    generate_kwargs = {
        "temperature": temperature,
        "max_new_tokens": max_new_tokens,
        "top_p": float(top_p),
        "repetition_penalty": repetition_penalty,
        "do_sample": True,
        "seed": 42,
    }

    formatted_prompt = format_prompt(audio_text, history)
    stream = client.text_generation(
        formatted_prompt, **generate_kwargs, stream=True, details=True, return_full_text=True
    )

    # Accumulate streamed tokens, then collapse whitespace and strip the EOS marker.
    raw = "".join(chunk.token.text for chunk in stream)
    response = ' '.join(raw.split()).replace('</s>', '')

    audio_file = text_to_speech(response, speed=1.3)
    return response, audio_file
def text_to_speech(text, speed=1.3):
    """Render Spanish *text* to an in-memory MP3, sped up by *speed*.

    Returns a ``BytesIO`` positioned at 0, ready to be read.
    """
    # Synthesize straight into memory — no temp file needed.
    mp3_buffer = io.BytesIO()
    gTTS(text=text, lang='es').write_to_fp(mp3_buffer)
    mp3_buffer.seek(0)

    # pydub decodes the clip and re-encodes it at the requested playback speed.
    accelerated = AudioSegment.from_file(mp3_buffer, format="mp3").speedup(playback_speed=speed)

    result = io.BytesIO()
    accelerated.export(result, format="mp3")
    result.seek(0)
    return result
def main():
    """Record microphone audio, transcribe it, query the model, and autoplay the spoken reply."""
    audio_data = audiorecorder("Presiona para hablar", "Deteniendo la grabación...")

    if not audio_data.empty():
        # Play back the recording and persist it so SpeechRecognition can open it.
        st.audio(audio_data.export().read(), format="audio/wav")
        audio_data.export("audio.wav", format="wav")
        audio_text = recognize_speech("audio.wav")

        if audio_text:
            output, audio_file = generate(audio_text, history=st.session_state.history)

            # BUG FIX: persist the exchange. Previously nothing ever appended to
            # st.session_state.history, so format_prompt() always built a
            # memory-less prompt and the "conversation" had no context.
            st.session_state.history.append((audio_text, output))

            if audio_file is not None:
                # Embed the MP3 as a base64 data URI so the browser autoplays it.
                st.markdown(
                    f"""<audio autoplay="autoplay" controls="controls" src="data:audio/mp3;base64,{base64.b64encode(audio_file.read()).decode()}" type="audio/mp3" id="audio_player"></audio>""",
                    unsafe_allow_html=True)

if __name__ == "__main__":
    main()