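# Voice-to-voice chatbot: records microphone audio in the browser, transcribes
# it with Google Speech Recognition, sends the transcript to
# mistralai/Mixtral-8x7B-Instruct-v0.1 via the Hugging Face Inference API, and
# plays the reply back as speech synthesized with gTTS.
# Run as a Streamlit app (`streamlit run`); pydub's MP3 handling assumes
# ffmpeg is available.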
import streamlit as st
import base64
import io
from huggingface_hub import InferenceClient
from gtts import gTTS
from audiorecorder import audiorecorder
import speech_recognition as sr
from pydub import AudioSegment
import pyttsx3

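# Keep the (user message, bot reply) history across Streamlit reruns.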
if "history" not in st.session_state:
    st.session_state.history = []

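# Transcribe a recorded WAV file with Google's speech recognition service
# (Spanish, es-ES). The microphone check is informational only: recognition
# runs on the saved file, not on a live device.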
def recognize_speech(audio_file, show_messages=True):
    recognizer = sr.Recognizer()
    mic_list = sr.Microphone.list_microphone_names()
    if not mic_list:
        # No capture devices found: announce it with local text-to-speech.
        engine = pyttsx3.init()
        engine.say("No microphones were found. Please make sure one is connected.")
        engine.runAndWait()
    else:
        print("Microphones found:")
        for i, mic in enumerate(mic_list):
            print(f"{i + 1}. {mic}")

    # Load the saved recording; recognize_google expects AudioData, not a path.
    with sr.AudioFile(audio_file) as source:
        audio = recognizer.record(source)

    try:
        audio_text = recognizer.recognize_google(audio, language="es-ES")
        if show_messages:
            st.subheader("Recognized Text:")
            st.write(audio_text)
            st.success("Speech recognition complete.")
    except sr.UnknownValueError:
        st.warning("The audio could not be recognized. Did you record anything?")
        audio_text = ""
    except sr.RequestError:
        st.error("The speech recognition service could not be reached.")
        audio_text = ""

    return audio_text

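# Build a Mixtral-instruct prompt: every prior (user, bot) turn is wrapped in
# [INST] ... [/INST] tags with the reply terminated by </s>, then the new
# user message is appended.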
def format_prompt(message, history):
    prompt_list = ["<s>"]

    for user_prompt, bot_response in history:
        prompt_list.extend([f"[INST] {user_prompt} [/INST]", f" {bot_response}</s> "])

    prompt_list.append(f"[INST] {message} [/INST]")
    return ''.join(prompt_list)

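# Stream a completion from Mixtral-8x7B-Instruct over the Hugging Face
# Inference API and synthesize the final text as speech.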
def generate(audio_text, history, temperature=None, max_new_tokens=512, top_p=0.95, repetition_penalty=1.0):
    client = InferenceClient("mistralai/Mixtral-8x7B-Instruct-v0.1")

    temperature = float(temperature) if temperature is not None else 0.9
    if temperature < 1e-2:
        temperature = 1e-2
    top_p = float(top_p)

    generate_kwargs = dict(
        temperature=temperature,
        max_new_tokens=max_new_tokens,
        top_p=top_p,
        repetition_penalty=repetition_penalty,
        do_sample=True,
        seed=42,
    )

    formatted_prompt = format_prompt(audio_text, history)
    # return_full_text=False so the stream carries only newly generated tokens.
    stream = client.text_generation(formatted_prompt, **generate_kwargs, stream=True, details=True, return_full_text=False)
    response = ""

    for response_token in stream:
        response += response_token.token.text
    
    response = ' '.join(response.split()).replace('</s>', '')
    audio_file = text_to_speech(response, speed=1.3)
    return response, audio_file

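# Synthesize Spanish speech with gTTS, then use pydub to speed the MP3 up
# slightly so replies sound less drawn out.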
def text_to_speech(text, speed=1.3):
    tts = gTTS(text=text, lang='es')
    audio_fp = io.BytesIO()
    tts.write_to_fp(audio_fp)
    audio_fp.seek(0)
    audio = AudioSegment.from_file(audio_fp, format="mp3")
    modified_speed_audio = audio.speedup(playback_speed=speed)
    modified_audio_fp = io.BytesIO()
    modified_speed_audio.export(modified_audio_fp, format="mp3")
    modified_audio_fp.seek(0)
    return modified_audio_fp

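# Streamlit entry point: record, play back the capture, transcribe, generate
# a reply, and autoplay the synthesized answer via an inline <audio> tag.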
def main():
    st.title("Voice-to-Voice Chatbot")
    # audiorecorder returns a pydub AudioSegment; an empty segment has length 0.
    audio_data = audiorecorder("Click to record", "Click to stop recording")

    if len(audio_data) > 0:
        st.audio(audio_data.export(format="wav").read(), format="audio/wav")
        audio_data.export("audio.wav", format="wav")
        audio_text = recognize_speech("audio.wav")

        if audio_text:
            output, audio_file = generate(audio_text, history=st.session_state.history)
            st.session_state.history.append((audio_text, output))

            if audio_file is not None:
                # Embed the MP3 reply as a base64 data URI so it autoplays.
                st.markdown(
                    f"""
                    <audio autoplay="autoplay" controls="controls" src="data:audio/mp3;base64,{base64.b64encode(audio_file.read()).decode()}" type="audio/mp3" id="audio_player"></audio>
                    """,
                    unsafe_allow_html=True
                )

if __name__ == "__main__":
    main()