File size: 3,590 Bytes
7d2c473
da45dce
 
 
 
882add6
 
5100159
882add6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b78309b
da45dce
 
c501864
da45dce
 
 
c501864
da45dce
 
8307fd0
da45dce
 
 
 
 
 
8307fd0
882add6
 
 
da45dce
 
 
 
8307fd0
da45dce
 
 
 
 
 
 
 
595f1c1
882add6
da45dce
 
8307fd0
da45dce
 
 
 
 
 
89ff019
882add6
 
da45dce
882add6
 
da45dce
882add6
da45dce
882add6
 
 
 
da45dce
882add6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
import streamlit as st
import base64
import io
from huggingface_hub import InferenceClient
from gtts import gTTS
from audiorecorder import audiorecorder
import speech_recognition as sr

def recognize_speech(audio_data):
    """Transcribe a recording to Spanish text and report progress in the UI.

    Args:
        audio_data: Audio source handed straight to ``sr.AudioFile``
            (``main`` passes the path of a WAV file).

    Returns:
        The recognized text, or ``""`` when the audio could not be
        understood or the recognition request failed.
    """
    st.info("Reconociendo audio...")

    recognizer = sr.Recognizer()

    try:
        with sr.AudioFile(audio_data) as source:
            audio = recognizer.record(source)
        audio_text = recognizer.recognize_google(audio, language="es-ES")
    except sr.UnknownValueError:
        st.warning("No se pudo reconocer el audio. ¿Intentaste grabar algo?")
        return ""
    except sr.RequestError as e:
        st.error(f"Error en el reconocimiento de voz: {e}")
        return ""

    # Only reached on success, so the success banner is never shown right
    # after a warning/error for a failed recognition.
    st.subheader("Texto Reconocido:")
    st.write(audio_text)
    st.success("Reconocimiento de voz completado.")
    return audio_text

def format_prompt(message, history):
    """Build the ``[INST] ... [/INST]`` prompt string for the chat model.

    Args:
        message: The new user message to append as the final instruction.
        history: Iterable of ``(user_prompt, bot_response)`` pairs from
            earlier turns, oldest first.

    Returns:
        A single string starting with ``<s>``, containing every past turn
        followed by the new message wrapped in ``[INST] ... [/INST]``.
    """
    past_turns = "".join(
        f"[INST] {asked} [/INST] {answered}</s> "
        for asked, answered in history
    )
    return f"<s>{past_turns}[INST] {message} [/INST]"

def text_to_speech(text, speed=1.3):
    """Synthesize Spanish speech for ``text`` into an in-memory buffer.

    Args:
        text: The text to speak.
        speed: Accepted for backward compatibility with existing callers;
            it is not forwarded to gTTS and has no effect on the audio.

    Returns:
        An ``io.BytesIO`` rewound to the start and holding the audio that
        gTTS wrote, or ``None`` when ``text`` is empty or whitespace-only.
    """
    # gTTS refuses to synthesize empty text, so signal "no audio" with None
    # instead of letting the call blow up.
    if not text or not text.strip():
        return None

    tts = gTTS(text=text, lang='es')
    audio_fp = io.BytesIO()
    tts.write_to_fp(audio_fp)
    audio_fp.seek(0)  # rewind so callers can read() from the beginning
    return audio_fp

def generate(audio_text, history, temperature=None, max_new_tokens=512, top_p=0.95, repetition_penalty=1.0):
    """Ask the hosted Mixtral model for a reply and synthesize it as speech.

    Args:
        audio_text: The user's (transcribed) message.
        history: Earlier ``(user_prompt, bot_response)`` pairs, passed to
            ``format_prompt``.
        temperature: Sampling temperature; ``None`` means 0.9. Values below
            0.01 are raised to 0.01.
        max_new_tokens: Forwarded to ``text_generation``.
        top_p: Forwarded to ``text_generation`` after conversion to float.
        repetition_penalty: Forwarded to ``text_generation``.

    Returns:
        A ``(response, audio_file)`` tuple: the cleaned reply text and the
        result of ``text_to_speech`` for it, or ``None`` for the audio when
        the reply is empty.
    """
    client = InferenceClient("mistralai/Mixtral-8x7B-Instruct-v0.1")

    temperature = 0.9 if temperature is None else float(temperature)
    temperature = max(temperature, 1e-2)
    top_p = float(top_p)

    generate_kwargs = dict(
        temperature=temperature,
        max_new_tokens=max_new_tokens,
        top_p=top_p,
        repetition_penalty=repetition_penalty,
        do_sample=True,
        seed=42,
    )

    formatted_prompt = format_prompt(audio_text, history)
    stream = client.text_generation(formatted_prompt, **generate_kwargs, stream=True, details=True, return_full_text=True)

    raw_response = "".join(chunk.token.text for chunk in stream)

    # Drop the end-of-sequence marker BEFORE collapsing whitespace; doing it
    # afterwards can leave a stray leading/trailing/double space behind.
    response = ' '.join(raw_response.replace('</s>', '').split())

    # An empty reply cannot be synthesized (gTTS rejects empty text); callers
    # already treat a None audio file as "nothing to play".
    audio_file = text_to_speech(response, speed=1.3) if response else None
    return response, audio_file

def main():
    """Run the Streamlit page: record, transcribe, answer, and speak.

    Flow: show the recorder widget; once a non-empty clip exists, play it
    back, save it as ``audio.wav``, transcribe it, send the text to the
    model together with the session's chat history, display the reply, and
    embed an auto-playing audio element with the spoken reply.
    """
    st.title("Grabación de Audio y Reconocimiento de Voz")

    if "history" not in st.session_state:
        st.session_state.history = []

    audio_data = audiorecorder("Habla para grabar", "Deteniendo la grabación...")

    # ``AudioSegment.empty()`` is a constructor for a new empty segment, not
    # an emptiness test, so ``not audio_data.empty()`` was always true.
    # ``len()`` is the clip length in milliseconds.
    if len(audio_data) > 0:
        # export() defaults to MP3; request WAV so the bytes match the
        # declared "audio/wav" MIME type.
        st.audio(audio_data.export(format="wav").read(), format="audio/wav")
        # export() returns the open file handle; close it so the file is
        # fully flushed before it is read back for recognition.
        audio_data.export("audio.wav", format="wav").close()
        st.write(f"Frame rate: {audio_data.frame_rate}, Frame width: {audio_data.frame_width}, Duration: {audio_data.duration_seconds} seconds")

        audio_text = recognize_speech("audio.wav")

        # Nothing was recognized: recognize_speech already told the user, so
        # do not query the model or speak a reply to an empty message.
        if not audio_text:
            return

        output, audio_file = generate(audio_text, history=st.session_state.history)
        st.text_area("Respuesta", height=100, value=output, key="output_text", disabled=True)

        st.session_state.history.append((audio_text, output))

        if audio_file is not None:
            encoded_audio = base64.b64encode(audio_file.read()).decode()
            st.markdown(
                f"""
                <audio autoplay="autoplay" controls="controls" src="data:audio/mp3;base64,{encoded_audio}" type="audio/mp3" id="audio_player"></audio>
                """,
                unsafe_allow_html=True
            )

if __name__ == "__main__":
    main()