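"""Streamlit demo: voice activity detection (webrtcvad) plus Google speech
recognition (SpeechRecognition) on live microphone audio captured with
sounddevice."""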
from traceback import format_exc

import numpy as np
import sounddevice as sd
import speech_recognition as sr
import streamlit as st
import webrtcvad
def update_vad_status(status):
    # Update the VAD status placeholder in the UI
    vad_status.text(status)
def process_microphone_input():
    # Sample rate and frame length (webrtcvad only accepts 10, 20 or 30 ms frames)
    sample_rate = 16000
    frame_duration_ms = 30
    frame_size = int(sample_rate * frame_duration_ms / 1000)  # 480 samples per frame

    # Create a VAD object and a speech recognizer
    vad = webrtcvad.Vad(2)  # aggressiveness 0-3; 2 is a reasonable middle ground
    recognizer = sr.Recognizer()

    # State shared with the audio callback
    vad_active = False
    speech_detected = False
    speech_frames = []
    phrase = ""

    def callback(indata, frames, time, status):
        nonlocal vad_active, speech_detected, phrase
        if status:
            print(f"Error in callback: {status}")
            return
        # sounddevice delivers a (frames, channels) array; keep the mono channel
        mono = indata[:, 0]
        # Process the audio in VAD-sized frames
        for i in range(0, len(mono), frame_size):
            frame = mono[i:i + frame_size]
            # webrtcvad rejects frames that are not exactly 10/20/30 ms long
            if len(frame) < frame_size:
                break
            # Detect whether the current frame contains speech
            is_speech = vad.is_speech(frame.tobytes(), sample_rate)
            # Update the state flags
            if is_speech and not vad_active:
                vad_active = True
                speech_detected = True
                update_vad_status("🎙️ Voice detection started")
            elif not is_speech and vad_active:
                vad_active = False
                update_vad_status("⏹️ Voice detection ended")
            if is_speech:
                # Buffer speech frames until a silence ends the phrase
                speech_frames.append(frame.tobytes())
            elif speech_detected:
                # Speech was detected and a silence followed: transcribe the phrase
                audio = sr.AudioData(b"".join(speech_frames), sample_rate, 2)  # 2 bytes per int16 sample
                speech_frames.clear()
                try:
                    text = recognizer.recognize_google(audio)
                    phrase += f" {text}"
                    st.text(f"🗣️ {text}")
                except sr.RequestError:
                    st.error("⚠️ Error transcribing the phrase - RequestError")
                except sr.UnknownValueError:
                    st.error("⚠️ Phrase not recognized - UnknownValueError")
                # Reset the phrase flag
                speech_detected = False

    try:
        # Capture audio from the microphone; the callback runs on a separate thread
        with sd.InputStream(callback=callback, channels=1,
                            samplerate=sample_rate, dtype=np.int16):
            st.warning("Speak and watch the changes in real time...")
            # Keep the stream open while the callback processes audio
            # (st.experimental_rerun() here would restart the script and close the stream)
            sd.sleep(30_000)  # listen for 30 seconds
    except Exception as e:
        st.error(f"Error during microphone input: {e}")
        st.error("Traceback:")
        st.error(format_exc())

    # Print the complete phrase
    st.success(f"Full transcription: {phrase}")
# Streamlit UI
st.title("VAD and Speech Recognition App (Microphone Input)")
vad_status = st.empty()
process_microphone_input()
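# To try the app locally (assuming this file is saved as app.py):
#   pip install numpy sounddevice SpeechRecognition streamlit webrtcvad
#   streamlit run app.py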