salomonsky committed (verified)
Commit 27eb034 · 1 Parent(s): ceacd81

Update app.py

Files changed (1): app.py (+37 −48)
app.py CHANGED
@@ -7,8 +7,8 @@ import streamlit as st
 import speech_recognition as sr
 from huggingface_hub import InferenceClient
 from streamlit_mic_recorder import mic_recorder
-import wave
 import webrtcvad
+import wave
 
 temp_audio_file_path = "./output.wav"
 
@@ -23,26 +23,29 @@ pre_prompt_text = "eres una IA conductual, tus respuestas serán breves."
 
 def recognize_speech(audio_data, show_messages=True):
     recognizer = sr.Recognizer()
-    audio_recording = sr.AudioFile(audio_data)
-
-    with audio_recording as source:
-        audio = recognizer.record(source)
 
-    try:
-        audio_text = recognizer.recognize_google(audio, language="es-ES")
-        if show_messages:
-            st.subheader("Texto Reconocido:")
-            st.write(audio_text)
-            st.success("Reconocimiento de voz completado.")
-    except sr.UnknownValueError:
-        st.warning("No se pudo reconocer el audio. ¿Intentaste grabar algo?")
-        audio_text = ""
-    except sr.RequestError:
-        st.error("Hablame para comenzar!")
-        audio_text = ""
+    # recognize_google() expects an sr.AudioData object, so wrap the raw PCM
+    # samples using the sample rate/width captured from the recorder in __main__
+    audio = sr.AudioData(bytes(audio_data), sample_rate, sample_width)
+    try:
+        audio_text = recognizer.recognize_google(audio, language="es-ES")
+        if show_messages:
+            st.subheader("Texto Reconocido:")
+            st.write(audio_text)
+            st.success("Reconocimiento de voz completado.")
+    except sr.UnknownValueError:
+        st.warning("No se pudo reconocer el audio. ¿Intentaste grabar algo?")
+        audio_text = ""
+    except sr.RequestError:
+        st.error("Hablame para comenzar!")
+        audio_text = ""
 
     return audio_text
 
+def detect_silence(audio_data, silence_threshold=5000, silence_duration=5000):
+    # silence_duration is a chunk length in samples, not milliseconds
+    is_silence = lambda x: max(x) < silence_threshold
+    chunks = [audio_data[i:i+silence_duration] for i in range(0, len(audio_data), silence_duration)]
+    silent_chunks = [chunk for chunk in chunks if is_silence(chunk)]
+    return silent_chunks
+
 def format_prompt(message, history):
     prompt = "<s>"
 
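A note on the recognize_speech() change above: speech_recognition's recognize_google() operates on sr.AudioData, and sr.AudioFile only reads container formats (WAV/AIFF/FLAC), never raw PCM. A minimal sketch of the AudioData route, assuming 16-bit mono samples in a NumPy array (transcribe_pcm is a hypothetical helper name, not part of the commit):

import numpy as np
import speech_recognition as sr

def transcribe_pcm(samples: np.ndarray, sample_rate: int, sample_width: int = 2) -> str:
    # Wrap raw int16 PCM in AudioData and send it to the Google Web Speech API
    recognizer = sr.Recognizer()
    audio = sr.AudioData(samples.tobytes(), sample_rate, sample_width)
    try:
        return recognizer.recognize_google(audio, language="es-ES")
    except (sr.UnknownValueError, sr.RequestError):
        return ""

Also worth noting: detect_silence() chunks by sample count, so with the default silence_duration=5000 a 48 kHz recording is split into pieces of roughly 104 ms each.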
@@ -93,38 +96,23 @@ def text_to_speech(text, speed=1.3):
 def audio_play(audio_fp):
     st.audio(audio_fp.read(), format="audio/mp3", start_time=0)
 
-def display_recognition_result(audio_text, output, audio_file):
-    if audio_text:
-        st.session_state.history.append((audio_text, output))
-
-    if audio_file is not None:
-        st.markdown(
-            f"""<audio autoplay="autoplay" controls="controls" src="data:audio/mp3;base64,{base64.b64encode(audio_file.read()).decode()}" type="audio/mp3" id="audio_player"></audio>""",
-            unsafe_allow_html=True)
-
 def voice_activity_detection(audio_data):
     return vad.is_speech(audio_data, sample_rate)
 
 def start_stream():
-    with wave.open(temp_audio_file_path, 'rb') as wave_file:
-        frames = wave_file.readframes(wave_file.getnframes())
-        audio_data = np.frombuffer(frames, dtype=np.int16)
-        detection = voice_activity_detection(audio_data)
-        print(detection)
-
-    audio_text = recognize_speech(temp_audio_file_path)
-    st.subheader("Texto Reconocido:")
-    st.write(audio_text)
-
-class Threader(threading.Thread):
-    def __init__(self, *args, **kwargs):
-        threading.Thread.__init__(self, *args, **kwargs)
-        self.start()
-
-    def run(self):
-        if self.name == 'mythread':
-            print("Started mythread")
-            start_stream()
+    recognizer = sr.Recognizer()
+    with sr.AudioFile(temp_audio_file_path) as source:
+        audio = recognizer.record(source)
+
+    audio_data = np.frombuffer(audio.frame_data, dtype=np.int16)
+    # Detect silences in the audio
+    silent_chunks = detect_silence(audio_data)
+
+    for silent_chunk in silent_chunks:
+        # Get text from each silent chunk
+        audio_text = recognize_speech(silent_chunk)
+        st.subheader("Texto Reconocido:")
+        st.write(audio_text)
 
 if __name__ == "__main__":
     # Changes to use streamlit_mic_recorder
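One caveat on voice_activity_detection() above: webrtcvad's Vad.is_speech() only accepts 16-bit mono PCM frames of exactly 10, 20, or 30 ms at 8, 16, 32, or 48 kHz, so a whole recording has to be sliced into frames before it can be scored. A minimal sketch of that framing (speech_frames and frame_duration_ms are illustrative names, not from the commit):

import webrtcvad

def speech_frames(pcm: bytes, sample_rate: int, frame_duration_ms: int = 30):
    # Yield (is_speech, frame) pairs; 2 bytes per 16-bit sample
    vad = webrtcvad.Vad(1)
    frame_len = int(sample_rate * frame_duration_ms / 1000) * 2
    for offset in range(0, len(pcm) - frame_len + 1, frame_len):
        frame = pcm[offset:offset + frame_len]
        yield vad.is_speech(frame, sample_rate), frame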
@@ -133,21 +121,22 @@ if __name__ == "__main__":
 
     if audio:
         st.audio(audio['bytes'])
+
        audio_bytes = audio["bytes"]
        sample_width = audio["sample_width"]
        sample_rate = audio["sample_rate"]
-        num_channels = 1
 
        with wave.open(temp_audio_file_path, 'wb') as wave_file:
-            wave_file.setnchannels(num_channels)
+            wave_file.setnchannels(1)
            wave_file.setsampwidth(sample_width)
            wave_file.setframerate(sample_rate)
            wave_file.writeframes(audio_bytes)
 
        vad = webrtcvad.Vad(1)
+
        channels = [1]
        mapping = [c - 1 for c in channels]
-        sample_rate = int(sr.AudioFile(temp_audio_file_path)._samples_per_second)
+        sample_rate = int(sample_rate)
        interval_size = 10
        downsample = 1
        block_size = int(sample_rate * interval_size / 1000)
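As a quick check on the last line: at the 48 kHz rate browser recorders commonly deliver (an assumed rate, not one fixed by the commit), block_size = int(48000 * 10 / 1000) = 480 samples per 10 ms interval, which is also one of the frame lengths webrtcvad accepts.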
 