Spaces:
Sleeping
Sleeping
File size: 3,388 Bytes
a202e44 d4cfc35 a8fb390 e89f269 3db068f e89f269 5bb1a10 88f6f66 e89f269 d67f0a9 2dc98a7 0f213dd bdccd83 2a43b85 bdccd83 173c390 bdccd83 47759f3 2a43b85 bdccd83 a34685e bdccd83 a34685e b303f3d a34685e b303f3d a34685e bdccd83 b7431cd e89f269 a34685e bdccd83 a34685e fb054e7 a34685e e89f269 a34685e 376b54f 1ba9aea a34685e 376b54f 415313d 5bb1a10 8884768 5bb1a10 d4cfc35 5bb1a10 ec80b46 5bb1a10 ec80b46 bdccd83 5bb1a10 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 |
import streamlit as st
import base64
import io
import time
from huggingface_hub import InferenceClient
from gtts import gTTS
from pydub import AudioSegment
import speech_recognition as sr
import streamlit_mic_recorder as smr
# Optional instruction text that format_prompt() places in the first
# [INST] block, once per session. Empty by default.
pre_prompt_text = ""

# Streamlit re-runs this script on every interaction, so per-session values
# live in st.session_state and are only seeded when they are missing.
if "history" not in st.session_state:
    # Sequence of (user_prompt, bot_response) pairs consumed by format_prompt().
    st.session_state["history"] = []
if "pre_prompt_sent" not in st.session_state:
    # Flipped to True by format_prompt() once the pre-prompt has been emitted.
    st.session_state["pre_prompt_sent"] = False
def recognize_speech(audio_data, show_messages=True):
    """Transcribe recorded audio to Spanish text.

    Args:
        audio_data: Audio source handed to ``sr.AudioFile``.
        show_messages: When true, display the recognized text and a success
            notice in the Streamlit page.

    Returns:
        The recognized text, or an empty string when recognition fails
        (a warning or error is shown in the page in that case).
    """
    engine = sr.Recognizer()

    # Load the whole recording into memory before sending it for recognition.
    with sr.AudioFile(audio_data) as source:
        captured = engine.record(source)

    try:
        transcript = engine.recognize_google(captured, language="es-ES")
    except sr.UnknownValueError:
        st.warning("The audio could not be recognized. Did you try to record something?")
        return ""
    except sr.RequestError:
        st.error("Push/Talk to start!")
        return ""

    if show_messages:
        st.subheader("Recognized text:")
        st.write(transcript)
        st.success("Voice Recognized.")

    return transcript
def format_prompt(message, history):
    """Build the ``[INST]``-tagged prompt string for the model.

    The prompt starts with ``<s>``, optionally followed by the module-level
    ``pre_prompt_text`` (emitted only while the session's ``pre_prompt_sent``
    flag is false, which this function then sets to true), then every past
    exchange from ``history``, and finally the new ``message``.

    Args:
        message: The new user message.
        history: Iterable of ``(user_prompt, bot_response)`` pairs.

    Returns:
        The assembled prompt string.
    """
    pieces = ["<s>"]

    if not st.session_state.pre_prompt_sent:
        pieces.append(f"[INST] {pre_prompt_text} [/INST]")
        st.session_state.pre_prompt_sent = True

    for user_turn, bot_turn in history:
        pieces.append(f"[INST] {user_turn} [/INST]")
        pieces.append(f" {bot_turn}</s> ")

    pieces.append(f"[INST] {message} [/INST]")
    return "".join(pieces)
def generate(
    audio_text,
    history=None,
    temperature=None,
    max_new_tokens=512,
    top_p=0.95,
    repetition_penalty=1.0,
):
    """Generate a model reply for the recognized text and synthesize it as speech.

    Args:
        audio_text: The user's message (recognized speech).
        history: Iterable of ``(user_prompt, bot_response)`` pairs for earlier
            turns. ``None`` (the default) means no earlier turns, so callers
            that pass only ``audio_text`` keep working.
        temperature: Sampling temperature; defaults to 0.9 and is clamped to
            a minimum of 0.01.
        max_new_tokens: Maximum number of tokens to generate.
        top_p: Nucleus-sampling probability mass.
        repetition_penalty: Repetition penalty forwarded to the model.

    Returns:
        A ``(response, audio_file)`` tuple. ``response`` is the cleaned reply
        text. ``audio_file`` is a ``BytesIO`` with the spoken reply, or
        ``None`` when the model produced no text to speak.
    """
    history = [] if history is None else history
    client = InferenceClient("mistralai/Mixtral-8x7B-Instruct-v0.1")

    temperature = float(temperature) if temperature is not None else 0.9
    temperature = max(temperature, 1e-2)
    top_p = float(top_p)

    generate_kwargs = dict(
        temperature=temperature,
        max_new_tokens=max_new_tokens,
        top_p=top_p,
        repetition_penalty=repetition_penalty,
        do_sample=True,
        seed=42,
    )

    formatted_prompt = format_prompt(audio_text, history)
    stream = client.text_generation(
        formatted_prompt,
        **generate_kwargs,
        stream=True,
        details=True,
        return_full_text=True,
    )

    response = "".join(chunk.token.text for chunk in stream)
    # Strip the end-of-sequence marker first, then collapse whitespace, so
    # removing the marker cannot leave a trailing or doubled space behind.
    response = " ".join(response.replace("</s>", "").split())

    # Skip synthesis for an empty reply; callers already handle a None audio.
    audio_file = text_to_speech(response, speed=1.3) if response else None
    return response, audio_file


def text_to_speech(text, speed=1.0):
    """Synthesize Spanish speech for ``text`` with gTTS.

    Args:
        text: The text to speak.
        speed: Playback-speed multiplier. Values of 1.0 or below return the
            gTTS output unchanged; values above 1.0 re-encode the audio
            through pydub's ``speedup`` effect.
            NOTE(review): pydub decodes/encodes MP3 through ffmpeg and its
            ``speedup`` effect is expected to reject very short clips —
            confirm both on the deployment target.

    Returns:
        An ``io.BytesIO`` positioned at the start of the MP3 data.
    """
    tts = gTTS(text=text, lang='es')
    audio_fp = io.BytesIO()
    tts.write_to_fp(audio_fp)
    audio_fp.seek(0)

    if speed is None or speed <= 1.0:
        return audio_fp

    segment = AudioSegment.from_file(audio_fp, format="mp3")
    faster = segment.speedup(playback_speed=speed)

    sped_fp = io.BytesIO()
    faster.export(sped_fp, format="mp3")
    sped_fp.seek(0)
    return sped_fp
def main():
    """Render the page: record audio, transcribe it, and play the spoken reply.

    When the "Procesar" button is pressed, the recorded audio is played back,
    transcribed with ``recognize_speech``, sent to ``generate`` together with
    the session's conversation history, and the resulting speech is embedded
    as an autoplaying HTML audio element.
    """
    st.write("Di la palabra XAMAN para empezar o DETENTE para procesar")
    # NOTE(review): `audio_recorder` / `get_audio` could not be confirmed
    # against the streamlit_mic_recorder API (its documented entry point is
    # `mic_recorder`) — verify these calls against the installed version.
    st.write(smr.audio_recorder())

    if not st.button("Procesar"):
        return

    audio_data = smr.get_audio()
    if audio_data is None:
        # Nothing was recorded; avoid handing None to the audio widgets.
        st.warning("No hay audio grabado para procesar.")
        return

    st.audio(audio_data, format="audio/wav")
    audio_text = recognize_speech(audio_data)
    if not audio_text:
        return

    history = st.session_state.history
    output, audio_file = generate(audio_text, history)
    # Remember this exchange so later prompts include the conversation so far.
    history.append((audio_text, output))

    if audio_file is not None:
        encoded_audio = base64.b64encode(audio_file.read()).decode()
        st.markdown(
            f"""<audio autoplay="autoplay" controls="controls" src="data:audio/mp3;base64,{encoded_audio}" type="audio/mp3" id="audio_player"></audio>""",
            unsafe_allow_html=True)
# Run the app only when this file is executed as the entry script.
if __name__ == "__main__":
    main()