salomonsky committed
Commit 574c2e1 · verified · 1 Parent(s): cfa9e19

Update app.py

Files changed (1):
  app.py +51 -26
app.py CHANGED
@@ -40,6 +40,28 @@ def recognize_speech(audio_data, show_messages=True):
 
     return audio_text
 
+def start_stream():
+    recognizer = sr.Recognizer()
+    with sr.AudioFile(temp_audio_file_path) as source:
+        audio = recognizer.record(source)
+
+    audio_data = np.frombuffer(audio.frame_data, dtype=np.int16)
+    silent_chunks = detect_silence(audio_data)
+
+    for silent_chunk in silent_chunks:
+        audio_text = recognize_speech(silent_chunk)
+        st.subheader("Texto Reconocido:")
+        st.write(audio_text)
+
+        # Update the history with the recognized text
+        st.session_state.history.append((audio_text, ""))
+
+        # Generate the text-to-speech response using the recognized text as the prompt
+        response, audio_file = generate(audio_text, st.session_state.history)
+
+        # Display the response and play the audio
+        display_recognition_result(audio_text, response, audio_file)
+
 def detect_silence(audio_data, silence_threshold=5000, silence_duration=5000):
     is_silence = lambda x: max(x) < silence_threshold
     chunks = [audio_data[i:i+silence_duration] for i in range(0, len(audio_data), silence_duration)]
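A note on the detect_silence helper this commit keeps: the chunk test max(x) < silence_threshold looks only at the most positive sample, so a loud negative-going peak can still be classified as silence, and silence_duration counts samples, not milliseconds. A minimal sketch of an absolute-amplitude variant (detect_silence_abs and frame_len are illustrative names, not code from this commit):

import numpy as np

def detect_silence_abs(audio_data, silence_threshold=5000, frame_len=5000):
    # Slice the int16 signal into fixed-length frames; frame_len is in samples.
    frames = [audio_data[i:i + frame_len]
              for i in range(0, len(audio_data), frame_len)]
    # A frame counts as silent when its peak absolute amplitude stays below the threshold.
    return [f for f in frames if f.size and np.abs(f).max() < silence_threshold]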
@@ -96,6 +118,15 @@ def text_to_speech(text, speed=1.3):
 def audio_play(audio_fp):
     st.audio(audio_fp.read(), format="audio/mp3", start_time=0)
 
+def display_recognition_result(audio_text, output, audio_file):
+    if audio_text:
+        st.session_state.history.append((audio_text, output))
+
+    if audio_file is not None:
+        st.markdown(
+            f"""<audio autoplay="autoplay" controls="controls" src="data:audio/mp3;base64,{base64.b64encode(audio_file.read()).decode()}" type="audio/mp3" id="audio_player"></audio>""",
+            unsafe_allow_html=True)
+
 def voice_activity_detection(audio_data):
     return vad.is_speech(audio_data, sample_rate)
 
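The added display_recognition_result autoplays the generated reply by inlining the MP3 bytes as a base64 data URI inside a raw <audio> tag rendered through st.markdown, since plain st.audio playback does not start automatically here. The same pattern as a small reusable helper (autoplay_mp3 is an illustrative name, not part of the commit):

import base64
import streamlit as st

def autoplay_mp3(mp3_bytes: bytes) -> None:
    # Encode the MP3 as a base64 data URI and render an autoplaying <audio> element.
    b64 = base64.b64encode(mp3_bytes).decode()
    st.markdown(
        f'<audio autoplay controls src="data:audio/mp3;base64,{b64}"></audio>',
        unsafe_allow_html=True,
    )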
 
@@ -105,36 +136,30 @@ def start_stream():
         audio = recognizer.record(source)
 
     audio_data = np.frombuffer(audio.frame_data, dtype=np.int16)
-    # Detect silences in the audio
     silent_chunks = detect_silence(audio_data)
 
     for silent_chunk in silent_chunks:
-        # Get the text of each silent chunk
         audio_text = recognize_speech(silent_chunk)
         st.subheader("Texto Reconocido:")
         st.write(audio_text)
 
-if __name__ == "__main__":
-    # Changes to use streamlit_mic_recorder
-    st.write("Record your voice, and play the recorded audio:")
-    audio = mic_recorder(start_prompt="▶️", stop_prompt="🛑", key='recorder')
-
-    if audio:
-        st.audio(audio['bytes'])
-
-        audio_bytes = audio["bytes"]
-        sample_width = audio["sample_width"]
-        sample_rate = audio["sample_rate"]
-
-        with wave.open(temp_audio_file_path, 'wb') as wave_file:
-            wave_file.setnchannels(1)
-            wave_file.setsampwidth(sample_width)
-            wave_file.setframerate(sample_rate)
-            wave_file.writeframes(audio_bytes)
-
-    vad = webrtcvad.Vad(1)
-
-    channels = [1]
-    mapping = [c - 1 for c in channels]
-    interval_size = 10
-    downsample = 1
+audio = mic_recorder(start_prompt="▶️", stop_prompt="🛑", key='recorder')
+
+if audio:
+    st.audio(audio['bytes'])
+
+    audio_bytes = audio["bytes"]
+    sample_width = audio["sample_width"]
+    sample_rate = audio["sample_rate"]
+
+    with wave.open(temp_audio_file_path, 'wb') as wave_file:
+        wave_file.setnchannels(1)
+        wave_file.setsampwidth(sample_width)
+        wave_file.setframerate(sample_rate)
+        wave_file.writeframes(audio_bytes)
+
+vad = webrtcvad.Vad(1)
+channels = [1]
+mapping = [c - 1 for c in channels]
+interval_size = 10
+downsample = 1
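One caveat on the VAD wiring this commit moves to module level: voice_activity_detection hands arbitrary buffers to vad.is_speech, but webrtcvad only accepts 16-bit mono PCM bytes in exact 10, 20, or 30 ms frames at 8, 16, 32, or 48 kHz. A sketch of frame-by-frame classification consistent with the interval_size = 10 setting above (frames_with_speech is an illustrative helper, not part of the commit):

import webrtcvad

def frames_with_speech(pcm: bytes, sample_rate: int = 16000, frame_ms: int = 10):
    # webrtcvad requires frames of exactly 10, 20, or 30 ms of 16-bit mono PCM.
    vad = webrtcvad.Vad(1)  # aggressiveness 1, as in the commit
    frame_bytes = int(sample_rate * frame_ms / 1000) * 2  # 2 bytes per int16 sample
    for offset in range(0, len(pcm) - frame_bytes + 1, frame_bytes):
        yield offset, vad.is_speech(pcm[offset:offset + frame_bytes], sample_rate)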
 
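Finally, both start_stream and display_recognition_result append to st.session_state.history without initializing it, so app.py presumably sets it up elsewhere. The usual guard looks like this (an assumption about the surrounding app, not part of this diff):

import streamlit as st

# Create the conversation history once per session, before anything appends to it.
if "history" not in st.session_state:
    st.session_state.history = []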