Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -1,11 +1,13 @@
|
|
1 |
from tempfile import NamedTemporaryFile
|
2 |
import streamlit as st
|
3 |
-
import base64
|
4 |
import io
|
5 |
from huggingface_hub import InferenceClient
|
6 |
from gtts import gTTS
|
|
|
7 |
import speech_recognition as sr
|
8 |
from pydub import AudioSegment
|
|
|
|
|
9 |
|
10 |
def recognize_speech(audio_data, show_messages=True):
|
11 |
recognizer = sr.Recognizer()
|
@@ -53,27 +55,32 @@ def generate(audio_text, history, generation_params):
|
|
53 |
audio_file = text_to_speech(response, speed=1.3)
|
54 |
return response, audio_file
|
55 |
|
56 |
-
def process_audio(frames, recognizer):
|
57 |
-
|
58 |
-
|
59 |
-
|
60 |
-
|
61 |
-
|
62 |
-
|
63 |
-
|
64 |
-
|
65 |
-
|
66 |
-
|
67 |
-
|
68 |
-
|
69 |
-
|
70 |
-
|
71 |
-
|
72 |
-
|
73 |
-
|
74 |
-
|
75 |
-
|
76 |
-
|
|
|
|
|
|
|
|
|
|
|
77 |
|
78 |
def text_to_speech(text, speed=1.3):
|
79 |
tts = gTTS(text=text, lang='es')
|
@@ -88,24 +95,26 @@ def text_to_speech(text, speed=1.3):
|
|
88 |
|
89 |
def main():
|
90 |
r = sr.Recognizer()
|
|
|
91 |
|
92 |
mic_list = sr.Microphone.list_microphone_names()
|
93 |
print("Dispositivos de micr贸fono encontrados:")
|
94 |
for i, microphone_name in enumerate(mic_list):
|
95 |
print(f"Dispositivo {i}: {microphone_name}")
|
96 |
|
97 |
-
|
98 |
-
|
99 |
-
|
100 |
-
|
101 |
-
|
102 |
-
|
103 |
-
|
104 |
-
|
105 |
-
|
106 |
-
|
107 |
-
|
108 |
-
|
109 |
-
|
110 |
-
|
111 |
-
|
|
|
|
1 |
from tempfile import NamedTemporaryFile
|
2 |
import streamlit as st
|
|
|
3 |
import io
|
4 |
from huggingface_hub import InferenceClient
|
5 |
from gtts import gTTS
|
6 |
+
import numpy as np
|
7 |
import speech_recognition as sr
|
8 |
from pydub import AudioSegment
|
9 |
+
import webrtcvad
|
10 |
+
import soundfile as sf
|
11 |
|
12 |
def recognize_speech(audio_data, show_messages=True):
|
13 |
recognizer = sr.Recognizer()
|
|
|
55 |
audio_file = text_to_speech(response, speed=1.3)
|
56 |
return response, audio_file
|
57 |
|
58 |
+
def process_audio(frames, recognizer, vad):
    """Filter raw audio through a VAD, transcribe the speech, and show it.

    Parameters:
        frames: raw 16-bit signed mono PCM bytes captured at 44100 Hz.
        recognizer: sr.Recognizer used for the Google Web Speech API call.
        vad: webrtcvad.Vad instance used to drop non-speech audio.

    Side effects: renders the recognized text into a Streamlit text input.
    Recognition errors (unintelligible audio, API failure) are swallowed
    on purpose — this is a best-effort UI path.
    """
    # webrtcvad only accepts 8/16/32/48 kHz input and frames of exactly
    # 10/20/30 ms of raw bytes; it returns one bool per frame, NOT a
    # per-sample mask. The original code passed a 44100 Hz numpy array and
    # indexed with the bool, both invalid. Resample to 16 kHz first.
    vad_rate = 16000
    resampled = AudioSegment(
        data=frames,
        sample_width=2,
        frame_rate=44100,
        channels=1,
    ).set_frame_rate(vad_rate)

    frame_ms = 30
    bytes_per_frame = (vad_rate * frame_ms // 1000) * 2  # 16-bit samples

    raw = resampled.raw_data
    speech_chunks = []
    # Walk fixed-size 30 ms frames; trailing partial frame is dropped
    # because the VAD rejects short buffers.
    for offset in range(0, len(raw) - bytes_per_frame + 1, bytes_per_frame):
        chunk = raw[offset:offset + bytes_per_frame]
        if vad.is_speech(chunk, vad_rate):
            speech_chunks.append(chunk)

    if speech_chunks:
        audio_data = AudioSegment(
            data=b"".join(speech_chunks),
            sample_width=2,
            frame_rate=vad_rate,
            channels=1,
        )

        # Round-trip through a temp WAV because sr.AudioFile needs a file.
        with NamedTemporaryFile(suffix=".wav") as temp_file:
            audio_data.export(temp_file.name, format="wav")
            audio_file = sr.AudioFile(temp_file.name)

            with audio_file as source:
                audio = recognizer.record(source)
            try:
                text = recognizer.recognize_google(audio, language="es-ES")
                st.text_input("Input", value=text)
            except sr.UnknownValueError:
                # No intelligible speech in this segment; nothing to show.
                pass
            except sr.RequestError:
                # Network/API failure; stay silent rather than crash the UI.
                pass
84 |
|
85 |
def text_to_speech(text, speed=1.3):
|
86 |
tts = gTTS(text=text, lang='es')
|
|
|
95 |
|
96 |
def main():
|
97 |
r = sr.Recognizer()
|
98 |
+
vad = webrtcvad.Vad()
|
99 |
|
100 |
mic_list = sr.Microphone.list_microphone_names()
|
101 |
print("Dispositivos de micr贸fono encontrados:")
|
102 |
for i, microphone_name in enumerate(mic_list):
|
103 |
print(f"Dispositivo {i}: {microphone_name}")
|
104 |
|
105 |
+
selected_device_index = None
|
106 |
+
|
107 |
+
for i in range(len(mic_list)):
|
108 |
+
try:
|
109 |
+
with sr.Microphone(device_index=i) as source:
|
110 |
+
vad.set_mode(3) # Establecer el modo de VAD
|
111 |
+
print(f"Probando con el dispositivo {i}...")
|
112 |
+
r.adjust_for_ambient_noise(source, duration=1)
|
113 |
+
selected_device_index = i
|
114 |
+
break
|
115 |
+
except sr.RequestError as e:
|
116 |
+
print(f"No se pudo conectar con el dispositivo {i}: {e}")
|
117 |
+
except sr.UnknownValueError:
|
118 |
+
pass
|
119 |
+
|
120 |
+
if selected_device_index is not
|