xaman4

Sleeping

App Files Community

salomonsky committed on Jan 24

Commit

0ca4c37

verified ·

1 Parent(s): a835363

Update app.py

Browse files

Files changed (1) hide show

app.py +8 -86

app.py CHANGED Viewed

@@ -1,86 +1,8 @@
-import gradio as gr
-import torch
-import numpy as np
-import json
-import pyaudio
-from huggingface_hub import hf_hub_download
-from vosk import Model, KaldiRecognizer
-from TTS.api import TTS
-from scipy.io import wavfile
-class VoiceAssistant:
-    def __init__(self):
-        self.vad_model = torch.hub.load('snakers4/silero-vad', 'silero_vad', force_reload=True)
-        model_path = hf_hub_download(repo_id="alphacep/vosk-model-small-es", filename="model.zip")
-        self.vosk_model = Model(model_path)
-        self.tts_model = TTS(model_name="tts_models/es/css10/full-dataset", progress_bar=False)
-        self.sample_rate = 16000
-        self.chunk_size = 480
-        self.p = pyaudio.PyAudio()
-        self.stream = self.p.open(format=pyaudio.paFloat32, channels=1, rate=self.sample_rate, input=True, frames_per_buffer=self.chunk_size)
-        self.keyword = "jarvis"
-    def vad_collector(self, vad_threshold=0.5):
-        audio_chunks, keyword_detected = [], False
-        while True:
-            data = self.stream.read(self.chunk_size)
-            audio_chunk = np.frombuffer(data, dtype=np.float32)
-            speech_prob = self.vad_model(torch.from_numpy(audio_chunk), self.sample_rate).item()
-            if speech_prob > vad_threshold:
-                audio_chunks.append(audio_chunk)
-                recognizer = KaldiRecognizer(self.vosk_model, self.sample_rate)
-                recognizer.AcceptWaveform(audio_chunk.tobytes())
-                result = json.loads(recognizer.Result())
-                if self.keyword.lower() in result.get('text', '').lower():
-                    keyword_detected = True
-                    break
-            if keyword_detected:
-                break
-        return audio_chunks, keyword_detected
-    def transcribe_audio(self, audio_chunks):
-        audio_data = np.concatenate(audio_chunks)
-        recognizer = KaldiRecognizer(self.vosk_model, self.sample_rate)
-        recognizer.AcceptWaveform(audio_data.tobytes())
-        result = json.loads(recognizer.Result())
-        return result.get('text', '')
-    def generate_response(self, text):
-        return "Respuesta generada para: " + text
-    def text_to_speech(self, text):
-        output_path = "response.wav"
-        self.tts_model.tts_to_file(text=text, file_path=output_path)
-        return output_path
-def process_audio():
-    assistant = VoiceAssistant()
-    audio_chunks, keyword_detected = assistant.vad_collector()
-    if keyword_detected:
-        transcribed_text = assistant.transcribe_audio(audio_chunks)
-        response = assistant.generate_response(transcribed_text)
-        audio_path = assistant.text_to_speech(response)
-        return transcribed_text, response, audio_path
-    else:
-        return "No se detectó la palabra clave.", "", ""
-iface = gr.Interface(
-    fn=process_audio,
-    inputs=[],
-    outputs=[
-        gr.Textbox(label="Texto Transcrito"),
-        gr.Textbox(label="Respuesta Generada"),
-        gr.Audio(label="Audio Generado")
-    ],
-    live=True,
-    title="Asistente de Voz JARVIS",
-    description="Presiona el botón para comenzar la escucha y decir 'JARVIS'."
-)
-if __name__ == "__main__":
-    iface.launch()

+streamlit==1.29.0
+torch==2.1.2
+numpy==1.22.0
+huggingface_hub==0.20.3
+transformers==4.36.2
+sounddevice==0.4.6
+TTS==0.22.0
+pyaudio==0.2.14