Spaces:
Running
Running
File size: 3,635 Bytes
5bbd161 23076a4 3784695 56f4168 719c3e8 5bbd161 eeaa144 5bbd161 4773685 a75c652 8b2bfa8 5bbd161 f2c3ba6 5bbd161 f2c3ba6 5bbd161 686ef78 5bbd161 686ef78 f2c3ba6 686ef78 eeaa144 093b41a 3e64240 093b41a a75c652 093b41a a75c652 3e64240 8307fd0 23076a4 3625f99 23076a4 093b41a 23076a4 da45dce 8307fd0 da45dce 86a2629 da45dce 8f22654 da45dce 89ff019 8f22654 23076a4 464ced7 23076a4 3784695 b39afbd 86a2629 5bbd161 b39afbd c8a52a4 7ce7fe0 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 |
import streamlit as st
import base64
import io
from huggingface_hub import InferenceClient
from gtts import gTTS
from audiorecorder import audiorecorder
import speech_recognition as sr
from pydub import AudioSegment
if "history" not in st.session_state:
st.session_state.history = []
def recognize_speech(audio_data, show_messages=True):
    """Transcribe Spanish speech from a recorded audio file.

    Args:
        audio_data: Path or file-like object accepted by ``sr.AudioFile``.
        show_messages: When True, echo the recognized text and a success
            banner in the Streamlit UI.

    Returns:
        The recognized text, or ``""`` when recognition fails.
    """
    recognizer = sr.Recognizer()
    with sr.AudioFile(audio_data) as source:
        captured = recognizer.record(source)
    try:
        transcript = recognizer.recognize_google(captured, language="es-ES")
    except sr.UnknownValueError:
        # Audio was captured but Google could not make out any words.
        st.warning("No se pudo reconocer el audio. ¿Intentaste grabar algo?")
        return ""
    except sr.RequestError:
        # The recognition API call itself failed (network / quota).
        st.error("Hablame para comenzar!")
        return ""
    if show_messages:
        st.subheader("Texto Reconocido:")
        st.write(transcript)
        st.success("Reconocimiento de voz completado.")
    return transcript
def format_prompt(message, history):
    """Build a Mixtral-instruct prompt from prior turns plus the new message.

    Each past (user, bot) pair becomes ``[INST] user [/INST] bot</s> ``;
    the new message is appended as a final open ``[INST] ... [/INST]`` turn.

    Args:
        message: The latest user utterance.
        history: Iterable of ``(user_prompt, bot_response)`` pairs.

    Returns:
        The fully formatted prompt string, starting with ``<s>``.
    """
    past_turns = "".join(
        f"[INST] {user_turn} [/INST] {bot_turn}</s> "
        for user_turn, bot_turn in history
    )
    return f"<s>{past_turns}[INST] {message} [/INST]"
def generate(audio_text, history, temperature=None, max_new_tokens=512, top_p=0.95, repetition_penalty=1.0):
    """Stream a chat reply from Mixtral and synthesize it to speech.

    Args:
        audio_text: The user's transcribed utterance.
        history: List of ``(user, bot)`` turn pairs for prompt context.
        temperature: Sampling temperature; ``None`` means 0.9.
        max_new_tokens: Generation length cap.
        top_p: Nucleus-sampling threshold.
        repetition_penalty: Penalty applied to repeated tokens.

    Returns:
        A ``(response_text, audio_file)`` tuple, where ``audio_file`` is the
        in-memory MP3 produced by :func:`text_to_speech`.
    """
    client = InferenceClient("mistralai/Mixtral-8x7B-Instruct-v0.1")
    # Default, then clamp away from zero — the endpoint rejects T ≈ 0.
    chosen_temp = 0.9 if temperature is None else float(temperature)
    sampling = {
        "temperature": max(chosen_temp, 1e-2),
        "max_new_tokens": max_new_tokens,
        "top_p": float(top_p),
        "repetition_penalty": repetition_penalty,
        "do_sample": True,
        "seed": 42,
    }
    prompt = format_prompt(audio_text, history)
    token_stream = client.text_generation(
        prompt, **sampling, stream=True, details=True, return_full_text=True
    )
    raw = "".join(piece.token.text for piece in token_stream)
    # Collapse runs of whitespace and strip the end-of-sequence marker.
    response = " ".join(raw.split()).replace("</s>", "")
    return response, text_to_speech(response, speed=1.3)
def text_to_speech(text, speed=1.3):
    """Render Spanish TTS audio for *text* at an accelerated playback speed.

    Args:
        text: The text to speak.
        speed: Playback speed-up factor applied to the synthesized audio.

    Returns:
        An ``io.BytesIO`` containing MP3 data, seeked to position 0.
    """
    synthesized = io.BytesIO()
    gTTS(text=text, lang='es').write_to_fp(synthesized)
    synthesized.seek(0)
    # Round-trip through pydub to apply the speed-up, then re-encode as MP3.
    accelerated = AudioSegment.from_file(synthesized, format="mp3").speedup(
        playback_speed=speed
    )
    result = io.BytesIO()
    accelerated.export(result, format="mp3")
    result.seek(0)
    return result
def main():
    """Streamlit entry point: record voice, transcribe, chat, and reply aloud."""
    st.title("Chatbot de Voz a Voz")
    audio_data = audiorecorder("Habla para grabar", "Deteniendo la grabación...")
    if audio_data.empty():
        return
    st.audio(audio_data.export().read(), format="audio/wav")
    # The recognizer reads from disk, so persist the recording first.
    audio_data.export("audio.wav", format="wav")
    audio_text = recognize_speech("audio.wav")
    if not audio_text:
        return
    output, audio_file = generate(audio_text, history=st.session_state.history)
    st.session_state.history.append((audio_text, output))
    if audio_file is not None:
        # Embed the MP3 as a base64 data URI so the browser autoplays it.
        encoded_mp3 = base64.b64encode(audio_file.read()).decode()
        st.markdown(
            f"""
            <audio autoplay="autoplay" controls="controls" src="data:audio/mp3;base64,{encoded_mp3}" type="audio/mp3" id="audio_player"></audio>
            """,
            unsafe_allow_html=True,
        )


if __name__ == "__main__":
    main()