Spaces:
Running
Running
File size: 4,355 Bytes
7d2c473 23076a4 3784695 56f4168 719c3e8 86a2629 eeaa144 23076a4 a75c652 8b2bfa8 f2c3ba6 72e1090 f2c3ba6 686ef78 f2c3ba6 686ef78 9ab5c4d 686ef78 f2c3ba6 686ef78 eeaa144 72e1090 093b41a 3e64240 093b41a a75c652 093b41a a75c652 3e64240 8307fd0 72e1090 23076a4 3625f99 23076a4 093b41a 23076a4 da45dce 8307fd0 da45dce 86a2629 da45dce 8f22654 da45dce 89ff019 72e1090 8f22654 23076a4 464ced7 23076a4 3784695 72e1090 f2c3ba6 eeaa144 b39afbd 86a2629 72e1090 eeaa144 b39afbd c8a52a4 7ce7fe0 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 |
import base64
import io

import cv2
import numpy as np
import speech_recognition as sr
import streamlit as st
from gtts import gTTS
from huggingface_hub import InferenceClient
from pydub import AudioSegment
from pydub.playback import play
from streamlit_webrtc import AudioProcessorBase, VideoProcessorBase, webrtc_streamer
# Persist the conversation history across Streamlit reruns.
st.session_state.setdefault("history", [])

# Shared speech-recognition engine used by the VAD processor below.
recognizer = sr.Recognizer()
# Reconociendo voz con VAD
# Recognizing speech with VAD
def recognize_speech_with_vad(audio_data, show_messages=True):
    """Transcribe `audio_data` to Spanish text using Google's recognizer.

    Parameters
    ----------
    audio_data : object accepted by ``recognizer.recognize_google``
        (presumably a SpeechRecognition ``AudioData`` — the caller in
        ``VADProcessor.recv`` passes a raw numpy buffer; verify).
    show_messages : bool
        When True, echo the recognized text into the Streamlit UI.

    Returns
    -------
    str
        The recognized text, or "" when recognition fails.
    """
    audio_text = ""
    try:
        audio_text = recognizer.recognize_google(audio_data, language="es-ES")
    except sr.UnknownValueError:
        # Audio was captured but could not be understood.
        st.warning("No se pudo reconocer el audio. ¿Intentaste grabar algo?")
    except sr.RequestError:
        # Bug fix: this branch fires when the Google API is unreachable,
        # not when the user has yet to speak — report the real problem
        # instead of the misleading "Hablame para comenzar!".
        st.error("No se pudo contactar el servicio de reconocimiento de voz.")
    else:
        if show_messages:
            st.subheader("Texto Reconocido:")
            st.write(audio_text)
    return audio_text
# Procesador de video para VAD con streamlit_webrtc
# Video/audio processor for VAD with streamlit_webrtc
class VADProcessor(AudioProcessorBase):
    """Accumulates ~5 seconds of microphone audio, then transcribes it and
    speaks the chatbot's answer.

    NOTE(review): streamlit_webrtc normally delivers ``av.AudioFrame``
    objects to ``recv``; this code treats the argument as a raw int16 byte
    buffer — confirm against the installed streamlit_webrtc version.
    """

    SAMPLE_RATE = 44100   # assumed capture rate in Hz — TODO confirm
    WINDOW_SECONDS = 5    # amount of audio gathered before transcribing

    def __init__(self):
        # Bug fix: keep the buffer as int16 — the original float64
        # np.zeros((0,)) silently upcast every concatenated int16 frame.
        self.buffer = np.zeros((0,), dtype=np.int16)
        self.vad_active = True

    def recv(self, audio_data):
        if self.vad_active:
            audio_array = np.frombuffer(audio_data, dtype=np.int16)
            self.buffer = np.concatenate((self.buffer, audio_array), axis=None)
            if len(self.buffer) >= self.SAMPLE_RATE * self.WINDOW_SECONDS:
                st.audio(self.buffer, format="audio/wav")
                audio_text = recognize_speech_with_vad(self.buffer)
                if audio_text:
                    st.success("Frase detectada. Procesando audio...")
                    output, audio_file = generate(audio_text, history=st.session_state.history)
                    if audio_file is not None:
                        play(audio_file)
                    # Deactivate the VAD after one phrase has been handled.
                    self.vad_active = False
                # Discard the window whether or not recognition succeeded.
                self.buffer = np.zeros((0,), dtype=np.int16)
# Preparando entrada para el modelo de lenguaje
# Preparing the input for the language model
def format_prompt(message, history):
    """Build a Mixtral-style instruction prompt from prior turns plus the
    new user message."""
    past_turns = [
        f"[INST] {user_prompt} [/INST] {bot_response}</s> "
        for user_prompt, bot_response in history
    ]
    return "<s>" + "".join(past_turns) + f"[INST] {message} [/INST]"
# Generando respuesta en texto
# Generating the text response
def generate(audio_text, history, temperature=None, max_new_tokens=512, top_p=0.95, repetition_penalty=1.0):
    """Stream a chat completion from Mixtral and synthesize it to speech.

    Parameters
    ----------
    audio_text : str — the user's transcribed utterance.
    history : list[tuple[str, str]] — prior (user, bot) turns.
    temperature, max_new_tokens, top_p, repetition_penalty — sampling knobs
        forwarded to ``InferenceClient.text_generation``.

    Returns
    -------
    tuple[str, io.BytesIO]
        The response text and an in-memory MP3 of its spoken form.
    """
    client = InferenceClient("mistralai/Mixtral-8x7B-Instruct-v0.1")

    # Default to 0.9 when unset and clamp to the API's minimum.
    temperature = float(temperature) if temperature is not None else 0.9
    temperature = max(temperature, 1e-2)
    top_p = float(top_p)

    generate_kwargs = dict(
        temperature=temperature,
        max_new_tokens=max_new_tokens,
        top_p=top_p,
        repetition_penalty=repetition_penalty,
        do_sample=True,
        seed=42,
    )

    formatted_prompt = format_prompt(audio_text, history)
    # Bug fix: return_full_text=False — with True the stream echoes the
    # prompt back, polluting the spoken response with the instruction text.
    stream = client.text_generation(
        formatted_prompt, **generate_kwargs, stream=True, details=True, return_full_text=False
    )

    response = "".join(token.token.text for token in stream)
    # Collapse whitespace and strip the end-of-sequence marker.
    response = " ".join(response.split()).replace("</s>", "")

    audio_file = text_to_speech(response, speed=1.3)
    return response, audio_file
# Texto a voz
# Text to speech
def text_to_speech(text, speed=1.3):
    """Render Spanish TTS for `text`, sped up by `speed`, as an in-memory MP3.

    Returns an ``io.BytesIO`` positioned at the start of the MP3 data.
    """
    raw_fp = io.BytesIO()
    gTTS(text=text, lang='es').write_to_fp(raw_fp)
    raw_fp.seek(0)

    # Decode the MP3, accelerate playback, and re-encode in memory.
    segment = AudioSegment.from_file(raw_fp, format="mp3")
    faster_segment = segment.speedup(playback_speed=speed)

    out_fp = io.BytesIO()
    faster_segment.export(out_fp, format="mp3")
    out_fp.seek(0)
    return out_fp
# Reproductor de texto a voz
# Text-to-speech player
def audio_player_markup(audio_file):
    """Return HTML for an autoplaying <audio> element with the MP3 embedded
    inline as a base64 data URI."""
    encoded_audio = base64.b64encode(audio_file.read()).decode()
    return f"""
    <audio autoplay="autoplay" controls="controls" src="data:audio/mp3;base64,{encoded_audio}" type="audio/mp3" id="audio_player"></audio>
    """
# Interfaz de usuario con streamlit_webrtc
# User interface with streamlit_webrtc
def main():
    """Entry point: render the page title and start the WebRTC audio capture."""
    st.title("Chatbot de Voz a Voz")
    streamer_ctx = webrtc_streamer(
        key="vad",
        audio_processor_factory=VADProcessor,
        async_processing=True,
        media_stream_constraints={"video": False, "audio": True},
    )


if __name__ == "__main__":
    main()