Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -3,7 +3,7 @@ import base64
|
|
3 |
import io
|
4 |
from huggingface_hub import InferenceClient
|
5 |
from gtts import gTTS
|
6 |
-
|
7 |
import speech_recognition as sr
|
8 |
from pydub import AudioSegment
|
9 |
|
@@ -11,15 +11,10 @@ if "history" not in st.session_state:
|
|
11 |
st.session_state.history = []
|
12 |
|
13 |
recognizer = sr.Recognizer()
|
14 |
-
microphone = sr.Microphone()
|
15 |
|
16 |
-
#
|
17 |
def recognize_speech_with_vad(audio_data, show_messages=True):
|
18 |
try:
|
19 |
-
with sr.AudioFile(audio_data) as source:
|
20 |
-
audio_data = recognizer.record(source, vad_enabled=True)
|
21 |
-
st.success("Fin de la grabación. Procesando audio...")
|
22 |
-
|
23 |
audio_text = recognizer.recognize_google(audio_data, language="es-ES")
|
24 |
|
25 |
if show_messages:
|
@@ -35,7 +30,7 @@ def recognize_speech_with_vad(audio_data, show_messages=True):
|
|
35 |
|
36 |
return audio_text
|
37 |
|
38 |
-
#
|
39 |
def format_prompt(message, history):
|
40 |
prompt = "<s>"
|
41 |
|
@@ -46,7 +41,7 @@ def format_prompt(message, history):
|
|
46 |
prompt += f"[INST] {message} [/INST]"
|
47 |
return prompt
|
48 |
|
49 |
-
#
|
50 |
def generate(audio_text, history, temperature=None, max_new_tokens=512, top_p=0.95, repetition_penalty=1.0):
|
51 |
client = InferenceClient("mistralai/Mixtral-8x7B-Instruct-v0.1")
|
52 |
|
@@ -75,7 +70,7 @@ def generate(audio_text, history, temperature=None, max_new_tokens=512, top_p=0.
|
|
75 |
audio_file = text_to_speech(response, speed=1.3)
|
76 |
return response, audio_file
|
77 |
|
78 |
-
#
|
79 |
def text_to_speech(text, speed=1.3):
|
80 |
tts = gTTS(text=text, lang='es')
|
81 |
audio_fp = io.BytesIO()
|
@@ -88,28 +83,34 @@ def text_to_speech(text, speed=1.3):
|
|
88 |
modified_audio_fp.seek(0)
|
89 |
return modified_audio_fp
|
90 |
|
91 |
-
#
|
92 |
def audio_player_markup(audio_file):
|
93 |
return f"""
|
94 |
<audio autoplay="autoplay" controls="controls" src="data:audio/mp3;base64,{base64.b64encode(audio_file.read()).decode()}" type="audio/mp3" id="audio_player"></audio>
|
95 |
"""
|
96 |
|
97 |
-
#
|
98 |
def main():
|
99 |
st.title("Chatbot de Voz a Voz")
|
100 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
101 |
|
102 |
-
|
103 |
-
|
104 |
-
audio_data.export("audio.wav", format="wav")
|
105 |
-
audio_text = recognize_speech_with_vad("audio.wav")
|
106 |
|
107 |
-
|
108 |
-
|
109 |
-
|
110 |
|
111 |
-
|
112 |
-
|
113 |
|
114 |
if __name__ == "__main__":
|
115 |
main()
|
|
|
3 |
import io
|
4 |
from huggingface_hub import InferenceClient
|
5 |
from gtts import gTTS
|
6 |
+
import sounddevice as sd
|
7 |
import speech_recognition as sr
|
8 |
from pydub import AudioSegment
|
9 |
|
|
|
11 |
st.session_state.history = []
|
12 |
|
13 |
recognizer = sr.Recognizer()
|
|
|
14 |
|
15 |
+
# Reconociendo voz con VAD
|
16 |
def recognize_speech_with_vad(audio_data, show_messages=True):
|
17 |
try:
|
|
|
|
|
|
|
|
|
18 |
audio_text = recognizer.recognize_google(audio_data, language="es-ES")
|
19 |
|
20 |
if show_messages:
|
|
|
30 |
|
31 |
return audio_text
|
32 |
|
33 |
+
# Preparando entrada para el modelo de lenguaje
|
34 |
def format_prompt(message, history):
|
35 |
prompt = "<s>"
|
36 |
|
|
|
41 |
prompt += f"[INST] {message} [/INST]"
|
42 |
return prompt
|
43 |
|
44 |
+
# Generando respuesta en texto
|
45 |
def generate(audio_text, history, temperature=None, max_new_tokens=512, top_p=0.95, repetition_penalty=1.0):
|
46 |
client = InferenceClient("mistralai/Mixtral-8x7B-Instruct-v0.1")
|
47 |
|
|
|
70 |
audio_file = text_to_speech(response, speed=1.3)
|
71 |
return response, audio_file
|
72 |
|
73 |
+
# Texto a voz
|
74 |
def text_to_speech(text, speed=1.3):
|
75 |
tts = gTTS(text=text, lang='es')
|
76 |
audio_fp = io.BytesIO()
|
|
|
83 |
modified_audio_fp.seek(0)
|
84 |
return modified_audio_fp
|
85 |
|
86 |
+
# Reproductor de texto a voz
|
87 |
def audio_player_markup(audio_file):
    """Return an HTML ``<audio>`` tag that autoplays the given MP3.

    The file-like object is read fully and embedded as a base64 data URI,
    so the caller must pass it positioned at the start (``seek(0)`` first —
    the sibling ``text_to_speech`` does this before returning).

    Args:
        audio_file: binary file-like object containing MP3 data.

    Returns:
        str: HTML snippet suitable for ``st.markdown(..., unsafe_allow_html=True)``.
    """
    encoded = base64.b64encode(audio_file.read()).decode()
    return f"""
    <audio autoplay="autoplay" controls="controls" src="data:audio/mp3;base64,{encoded}" type="audio/mp3" id="audio_player"></audio>
    """
|
91 |
|
92 |
+
# Interfaz de usuario
|
93 |
def main():
    """Streamlit entry point: record microphone audio, transcribe it, and
    play back the language model's spoken response."""
    st.title("Chatbot de Voz a Voz")

    # Configuración de dispositivos de entrada.
    # NOTE(review): the original called sd.query_devices(kind='input'), which
    # returns a SINGLE device dict (not a list), so iterating it yields dict
    # keys. Enumerate all devices and keep the input-capable ones instead.
    all_devices = sd.query_devices()
    input_devices = [d for d in all_devices if d['max_input_channels'] > 0]
    device_names = [d['name'] for d in input_devices]
    selected_name = st.selectbox("Selecciona tu micrófono:", device_names)
    # The original indexed with st.session_state.selected_device, which is
    # never assigned anywhere in the file — use the selectbox result directly.
    selected_index = device_names.index(selected_name)

    # Captura de audio con sounddevice: 5 seconds, 44.1 kHz, stereo int16.
    # sd.rec() opens its own stream, so the original's enclosing
    # sd.InputStream context was redundant (and ignored the chosen device);
    # pass the device explicitly instead.
    audio_data = sd.rec(
        int(44100 * 5),
        samplerate=44100,
        channels=2,
        dtype='int16',
        device=input_devices[selected_index]['index'],
    )
    sd.wait()  # block until the 5-second recording is complete

    st.audio(audio_data, format="audio/wav")
    # NOTE(review): recognize_speech_with_vad forwards its argument to
    # recognizer.recognize_google, which expects an sr.AudioData — confirm it
    # converts the raw ndarray; if not, wrap with
    # sr.AudioData(audio_data.tobytes(), 44100, 2) here.
    audio_text = recognize_speech_with_vad(audio_data)

    if audio_text:
        st.success("Frase detectada. Procesando audio...")
        output, audio_file = generate(audio_text, history=st.session_state.history)

        if audio_file is not None:
            st.markdown(audio_player_markup(audio_file), unsafe_allow_html=True)

if __name__ == "__main__":
    main()
|