salomonsky committed on
Commit badb078 · verified · 1 Parent(s): 574c2e1

Update app.py

Files changed (1)
  1. app.py +36 -65
app.py CHANGED
@@ -1,6 +1,5 @@
 import io
 import base64
-import threading
 import numpy as np
 from gtts import gTTS
 import streamlit as st
@@ -18,7 +17,6 @@ if "history" not in st.session_state:
 if "pre_prompt_sent" not in st.session_state:
     st.session_state.pre_prompt_sent = False
 
-gatherUsageStats = "false"
 pre_prompt_text = "eres una IA conductual, tus respuestas serán breves."
 
 def recognize_speech(audio_data, show_messages=True):
@@ -40,34 +38,6 @@ def recognize_speech(audio_data, show_messages=True):
 
     return audio_text
 
-def start_stream():
-    recognizer = sr.Recognizer()
-    with sr.AudioFile(temp_audio_file_path) as source:
-        audio = recognizer.record(source)
-
-    audio_data = np.frombuffer(audio.frame_data, dtype=np.int16)
-    silent_chunks = detect_silence(audio_data)
-
-    for silent_chunk in silent_chunks:
-        audio_text = recognize_speech(silent_chunk)
-        st.subheader("Texto Reconocido:")
-        st.write(audio_text)
-
-        # Update the history with the recognized text
-        st.session_state.history.append((audio_text, ""))
-
-        # Generate the text-to-speech response using the recognized text as the prompt
-        response, audio_file = generate(audio_text, st.session_state.history)
-
-        # Display the response and play the audio
-        display_recognition_result(audio_text, response, audio_file)
-
-def detect_silence(audio_data, silence_threshold=5000, silence_duration=5000):
-    is_silence = lambda x: max(x) < silence_threshold
-    chunks = [audio_data[i:i+silence_duration] for i in range(0, len(audio_data), silence_duration)]
-    silent_chunks = [chunk for chunk in chunks if is_silence(chunk)]
-    return silent_chunks
-
 def format_prompt(message, history):
     prompt = "<s>"
 
@@ -127,39 +97,40 @@ def display_recognition_result(audio_text, output, audio_file):
         f"""<audio autoplay="autoplay" controls="controls" src="data:audio/mp3;base64,{base64.b64encode(audio_file.read()).decode()}" type="audio/mp3" id="audio_player"></audio>""",
         unsafe_allow_html=True)
 
-def voice_activity_detection(audio_data):
-    return vad.is_speech(audio_data, sample_rate)
-
-def start_stream():
-    recognizer = sr.Recognizer()
-    with sr.AudioFile(temp_audio_file_path) as source:
-        audio = recognizer.record(source)
-
-    audio_data = np.frombuffer(audio.frame_data, dtype=np.int16)
-    silent_chunks = detect_silence(audio_data)
-
-    for silent_chunk in silent_chunks:
-        audio_text = recognize_speech(silent_chunk)
-        st.subheader("Texto Reconocido:")
-        st.write(audio_text)
-
-audio = mic_recorder(start_prompt="▶️", stop_prompt="🛑", key='recorder')
-
-if audio:
-    st.audio(audio['bytes'])
-
-    audio_bytes = audio["bytes"]
-    sample_width = audio["sample_width"]
-    sample_rate = audio["sample_rate"]
-
-    with wave.open(temp_audio_file_path, 'wb') as wave_file:
-        wave_file.setnchannels(1)
-        wave_file.setsampwidth(sample_width)
-        wave_file.setframerate(sample_rate)
-        wave_file.writeframes(audio_bytes)
+def detect_silence(audio_data, sample_rate, silence_threshold=5000, silence_duration=5000):
+    is_silence = lambda x: max(x) < silence_threshold
+    chunks = [audio_data[i:i+silence_duration] for i in range(0, len(audio_data), silence_duration)]
+    silent_chunks = [chunk for chunk in chunks if is_silence(chunk)]
+    return silent_chunks
 
-vad = webrtcvad.Vad(1)
-channels = [1]
-mapping = [c - 1 for c in channels]
-interval_size = 10
-downsample = 1
+def main():
+    if not st.session_state.pre_prompt_sent:
+        st.session_state.pre_prompt_sent = True
+
+    audio = mic_recorder(start_prompt="▶️", stop_prompt="🛑", key='recorder')
+
+    if audio:
+        st.audio(audio['bytes'], format="audio/wav")
+        audio_bytes = audio["bytes"]
+        sample_width = audio["sample_width"]
+        sample_rate = audio["sample_rate"]
+
+        with wave.open(temp_audio_file_path, 'wb') as wave_file:
+            wave_file.setnchannels(1)
+            wave_file.setsampwidth(sample_width)
+            wave_file.setframerate(sample_rate)
+            wave_file.writeframes(audio_bytes)
+
+        audio_data = np.frombuffer(audio_bytes, dtype=np.int16)
+        silent_chunks = detect_silence(audio_data, sample_rate)
+
+        for silent_chunk in silent_chunks:
+            audio_text = recognize_speech(silent_chunk)
+            st.subheader("Texto Reconocido:")
+            st.write(audio_text)
+            st.session_state.history.append((audio_text, ""))
+            response, audio_file = generate(audio_text, st.session_state.history)
+            display_recognition_result(audio_text, response, audio_file)
+
+if __name__ == "__main__":
+    main()
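
A review note on the new detect_silence: it slices the raw int16 sample array directly, so silence_duration=5000 is a chunk length in samples (about 0.31 s at 16 kHz), not milliseconds, and the newly added sample_rate parameter is never used inside the body. max(x) also reads signed samples, so a loud chunk that happens to swing negative can still be classified as silence. Below is a minimal sketch of a time-based, amplitude-safe variant, assuming the same int16 input; the name detect_silence_ms and the 500 ms default are illustrative, not part of the commit:

import numpy as np

def detect_silence_ms(audio_data, sample_rate, silence_threshold=5000, window_ms=500):
    # Convert the window from milliseconds to a sample count using the actual rate.
    window = int(sample_rate * window_ms / 1000)
    # Peak absolute amplitude; the int32 cast avoids int16 overflow on abs(-32768).
    is_silence = lambda chunk: np.max(np.abs(chunk.astype(np.int32))) < silence_threshold
    chunks = [audio_data[i:i + window] for i in range(0, len(audio_data), window)]
    return [chunk for chunk in chunks if len(chunk) and is_silence(chunk)]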
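
Also worth flagging in review: detect_silence returns the chunks that fall below the threshold, yet main() feeds each silent_chunk to recognize_speech, so recognition runs on the quiet segments rather than the speech. If the usual silence-based segmentation is intended, one possible complement is sketched below; the helper name voiced_chunks is hypothetical and reuses the windowing assumptions above:

def voiced_chunks(audio_data, sample_rate, silence_threshold=5000, window_ms=500):
    # Keep the windows whose peak amplitude clears the threshold, i.e. the speech.
    window = int(sample_rate * window_ms / 1000)
    chunks = [audio_data[i:i + window] for i in range(0, len(audio_data), window)]
    return [c for c in chunks if len(c) and np.max(np.abs(c.astype(np.int32))) >= silence_threshold]

In main() that would mean iterating over voiced_chunks(audio_data, sample_rate) instead of silent_chunks, with the quiet windows acting as utterance boundaries.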