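"""Voice-to-voice chatbot (Streamlit).

Captures microphone audio with streamlit_webrtc, transcribes it via Google
Speech Recognition, queries Mixtral-8x7B-Instruct through the Hugging Face
Inference API, and speaks the reply back with gTTS.

Assumed dependencies (not declared in this file): streamlit,
streamlit-webrtc, huggingface_hub, gTTS, pydub, SpeechRecognition, numpy.
Launch with `streamlit run <this file>`.
"""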
import base64
import io

import numpy as np
import speech_recognition as sr
import streamlit as st
from gtts import gTTS
from huggingface_hub import InferenceClient
from pydub import AudioSegment
from pydub.playback import play
from streamlit_webrtc import webrtc_streamer, AudioProcessorBase

if "history" not in st.session_state:
    st.session_state.history = []

recognizer = sr.Recognizer()

# Run speech recognition on a captured audio segment
def recognize_speech_with_vad(audio_data, show_messages=True):
    try:
        # `audio_data` must be an sr.AudioData instance (see VADProcessor)
        audio_text = recognizer.recognize_google(audio_data, language="es-ES")

        if show_messages:
            st.subheader("Texto Reconocido:")
            st.write(audio_text)

    except sr.UnknownValueError:
        st.warning("No se pudo reconocer el audio. ¿Intentaste grabar algo?")
        audio_text = ""
    except sr.RequestError:
        st.error("No se pudo conectar con el servicio de reconocimiento de voz.")
        audio_text = ""

    return audio_text

# Audio processor for VAD with streamlit_webrtc: buffers microphone audio
# and runs recognition once roughly five seconds of samples have accumulated.
# Note: Streamlit calls (st.success, st.session_state) made from this
# processor thread may not render reliably; this is a known
# streamlit_webrtc limitation.
class VADProcessor(AudioProcessorBase):
    SAMPLE_RATE = 48000  # WebRTC audio is typically delivered at 48 kHz

    def __init__(self):
        self.buffer = np.zeros((0,), dtype=np.int16)
        self.vad_active = True

    def recv(self, frame):
        if self.vad_active:
            # `frame` is an av.AudioFrame; convert it to raw int16 samples
            # (assumes the default mono s16 layout)
            audio_array = frame.to_ndarray().flatten().astype(np.int16)
            self.buffer = np.concatenate((self.buffer, audio_array))

            if len(self.buffer) >= self.SAMPLE_RATE * 5:  # ~5 seconds of audio
                st.audio(self.buffer, sample_rate=self.SAMPLE_RATE)

                # Wrap the raw samples so SpeechRecognition can consume them
                # (16-bit mono PCM)
                audio_data = sr.AudioData(self.buffer.tobytes(), self.SAMPLE_RATE, 2)
                audio_text = recognize_speech_with_vad(audio_data)

                if audio_text:
                    st.success("Frase detectada. Procesando audio...")
                    output, audio_file = generate(audio_text, history=st.session_state.history)
                    # Keep the exchange so the next prompt has context
                    st.session_state.history.append((audio_text, output))

                    if audio_file is not None:
                        # pydub's play() expects an AudioSegment, not a file object
                        play(AudioSegment.from_file(audio_file, format="mp3"))

                    # Deactivate VAD after a phrase has been handled
                    self.vad_active = False

                self.buffer = np.zeros((0,), dtype=np.int16)

        return frame


# Format the conversation history into the model's instruction template
def format_prompt(message, history):
    prompt = "<s>"

    for user_prompt, bot_response in history:
        prompt += f"[INST] {user_prompt} [/INST]"
        prompt += f" {bot_response}</s> "

    prompt += f"[INST] {message} [/INST]"
    return prompt

# Generate a text response (and its TTS audio) from the language model
def generate(audio_text, history, temperature=None, max_new_tokens=512, top_p=0.95, repetition_penalty=1.0):
    client = InferenceClient("mistralai/Mixtral-8x7B-Instruct-v0.1")

    temperature = float(temperature) if temperature is not None else 0.9
    if temperature < 1e-2:
        temperature = 1e-2
    top_p = float(top_p)

    generate_kwargs = dict(
        temperature=temperature,
        max_new_tokens=max_new_tokens,
        top_p=top_p,
        repetition_penalty=repetition_penalty,
        do_sample=True,
        seed=42,
    )

    formatted_prompt = format_prompt(audio_text, history)
    # return_full_text=False so the stream carries only newly generated tokens,
    # not the prompt itself
    stream = client.text_generation(formatted_prompt, **generate_kwargs, stream=True, details=True, return_full_text=False)
    response = ""

    for response_token in stream:
        response += response_token.token.text
    
    response = ' '.join(response.split()).replace('</s>', '')
    audio_file = text_to_speech(response, speed=1.3)
    return response, audio_file

# Text to speech
def text_to_speech(text, speed=1.3):
    tts = gTTS(text=text, lang='es')
    audio_fp = io.BytesIO()
    tts.write_to_fp(audio_fp)
    audio_fp.seek(0)
    audio = AudioSegment.from_file(audio_fp, format="mp3")
    modified_speed_audio = audio.speedup(playback_speed=speed)
    modified_audio_fp = io.BytesIO()
    modified_speed_audio.export(modified_audio_fp, format="mp3")
    modified_audio_fp.seek(0)
    return modified_audio_fp

# HTML snippet for an autoplaying text-to-speech player (not called by main)
def audio_player_markup(audio_file):
    return f"""
        <audio autoplay="autoplay" controls="controls" src="data:audio/mp3;base64,{base64.b64encode(audio_file.read()).decode()}" type="audio/mp3" id="audio_player"></audio>
    """

# User interface built on streamlit_webrtc
def main():
    st.title("Chatbot de Voz a Voz")

    webrtc_ctx = webrtc_streamer(
        key="vad",
        audio_processor_factory=VADProcessor,
        async_processing=True,
        media_stream_constraints={"video": False, "audio": True}, 
    )
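
    # Status hint via the returned context; `state.playing` is part of the
    # public WebRtcStreamerContext API
    if webrtc_ctx.state.playing:
        st.caption("Escuchando... habla para comenzar.")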

if __name__ == "__main__":
    main()