# Source: xaman4 / app.py (Hugging Face Space, author: salomonsky)
# Commit 7a4f83c (verified): "Update app.py" -- file size 3.84 kB
import streamlit as st
import torch
import numpy as np
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
import pyaudio
import sounddevice as sd
from TTS.api import TTS
class VoiceAssistant:
    """Spanish voice assistant: microphone capture, Wav2Vec2 ASR and TTS reply.

    The assistant records audio from the default input device, looks for an
    activation keyword ("jarvis") or a deactivation keyword ("detente") in the
    transcription of what it heard, and, when activated, answers with a
    synthesized audio file rendered in the Streamlit page.
    """

    def __init__(
        self,
        asr_model_name="facebook/wav2vec2-large-xlsr-53-spanish",
        tts_model_name="tts_models/es/css10/vits",
    ):
        """Load the speech models and open the microphone stream.

        Args:
            asr_model_name: Hugging Face checkpoint used for both the
                Wav2Vec2 processor and the CTC model.
            tts_model_name: Coqui TTS model name passed to ``TTS.api.TTS``.
        """
        # Wav2Vec2 model for Spanish speech recognition.
        self.processor = Wav2Vec2Processor.from_pretrained(asr_model_name)
        self.model = Wav2Vec2ForCTC.from_pretrained(asr_model_name)

        # Spanish speech synthesis.
        # NOTE(review): the previous value "microsoft/speecht5_tts" is a
        # Transformers checkpoint id, not a Coqui model name (Coqui expects
        # "<type>/<lang>/<dataset>/<model>"); confirm the default chosen here
        # suits the deployment.
        self.tts_model = TTS(model_name=tts_model_name, progress_bar=False)

        # Audio parameters: 16 kHz mono float32, 480 frames (30 ms) per read.
        self.sample_rate = 16000
        self.chunk_size = 480
        self.p = pyaudio.PyAudio()
        self.stream = self.p.open(
            format=pyaudio.paFloat32,
            channels=1,
            rate=self.sample_rate,
            input=True,
            frames_per_buffer=self.chunk_size,
        )

        # Keywords.
        self.keyword_activation = "jarvis"
        self.keyword_deactivation = "detente"

        # Listening state.
        self.listening = False

    def close(self):
        """Stop the microphone stream and release PyAudio. Safe to call twice."""
        # getattr guards cover instances whose __init__ failed part-way.
        stream = getattr(self, "stream", None)
        if stream is not None:
            stream.stop_stream()
            stream.close()
            self.stream = None
        audio = getattr(self, "p", None)
        if audio is not None:
            audio.terminate()
            self.p = None

    def __del__(self):
        # Best-effort cleanup so a discarded assistant does not keep the
        # input device open; a finalizer must never raise.
        try:
            self.close()
        except Exception:
            pass

    def vad_collector(self, vad_threshold=0.5, max_seconds=5.0):
        """Record audio until a keyword is heard or the time window elapses.

        Keywords are searched in the *transcription* of the buffered audio,
        which is refreshed roughly once per second of captured audio and once
        more when the window fills up. (The raw sample buffer can never
        contain a word, so comparing against it would never match.)

        Args:
            vad_threshold: Accepted for backward compatibility; currently
                unused.
            max_seconds: Upper bound on the recording length, so the loop
                always terminates and the buffer stays bounded.

        Returns:
            A tuple ``(audio_chunks, keyword_detected)`` where
            ``audio_chunks`` is the list of float32 arrays read from the
            stream and ``keyword_detected`` tells whether the activation
            keyword was heard. Hearing the deactivation keyword sets
            ``self.listening`` to ``False``.
        """
        audio_chunks = []
        keyword_detected = False
        chunks_per_second = self.sample_rate / self.chunk_size
        max_chunks = int(max_seconds * chunks_per_second)
        check_every = max(1, int(chunks_per_second))

        while self.listening and len(audio_chunks) < max_chunks:
            # Dropped frames are preferable to an exception mid-recording.
            data = self.stream.read(self.chunk_size, exception_on_overflow=False)
            audio_chunks.append(np.frombuffer(data, dtype=np.float32))

            window_full = len(audio_chunks) == max_chunks
            if len(audio_chunks) % check_every and not window_full:
                continue

            heard = self.transcribe_audio(audio_chunks).lower()
            # Activation keyword.
            if self.keyword_activation.lower() in heard:
                keyword_detected = True
                break
            # Deactivation keyword.
            if self.keyword_deactivation.lower() in heard:
                self.listening = False
                break

        return audio_chunks, keyword_detected

    def transcribe_audio(self, audio_chunks):
        """Transcribe the recorded chunks with Wav2Vec2.

        Args:
            audio_chunks: List of 1-D float32 arrays sampled at
                ``self.sample_rate``.

        Returns:
            The decoded transcription, or ``""`` when there is no audio.
        """
        if not audio_chunks:
            return ""
        audio_data = np.concatenate(audio_chunks)

        input_values = self.processor(
            audio_data, return_tensors="pt", sampling_rate=self.sample_rate
        ).input_values
        with torch.no_grad():
            logits = self.model(input_values).logits

        # Greedy CTC decoding: most likely token per frame.
        predicted_ids = torch.argmax(logits, dim=-1)
        transcription = self.processor.decode(predicted_ids[0])
        return transcription

    def generate_response(self, text):
        """Return the (placeholder) reply for the transcribed *text*."""
        return "Respuesta generada para: " + text

    def text_to_speech(self, text):
        """Synthesize *text* to ``response.wav`` and return that path."""
        output_path = "response.wav"
        self.tts_model.tts_to_file(text=text, file_path=output_path)
        return output_path

    def run(self):
        """Render the Streamlit UI and process one listening window."""
        st.title("Asistente de Voz JARVIS")

        # Button that toggles listening on/off.
        if st.button("Iniciar/Detener Escucha"):
            self.listening = not self.listening
            st.write("Escucha activada." if self.listening else "Escucha desactivada.")

        if not self.listening:
            return

        audio_chunks, keyword_detected = self.vad_collector()
        if not keyword_detected:
            # Report the outcome instead of ending the run silently.
            if self.listening:
                st.info("No se detectó la palabra de activación.")
            else:
                st.write("Escucha desactivada.")
            return

        st.success("Palabra clave 'JARVIS' detectada. Procesando...")
        transcribed_text = self.transcribe_audio(audio_chunks)
        st.write(f"Texto transcrito: {transcribed_text}")
        response = self.generate_response(transcribed_text)
        st.write(f"Respuesta: {response}")
        audio_path = self.text_to_speech(response)
        st.audio(audio_path)
def main():
    """Build the assistant once per browser session and render the UI.

    Streamlit re-executes the script on every widget interaction, so the
    assistant is kept in ``st.session_state``: the models are loaded and the
    microphone is opened only once, and the listening flag survives reruns.
    """
    if "assistant" not in st.session_state:
        st.session_state["assistant"] = VoiceAssistant()
    st.session_state["assistant"].run()
# Script entry point: start the app only when this file is executed directly.
if __name__ == "__main__":
    main()