salomonsky committed on
Commit
464ced7
verified
1 Parent(s): 9ab5c4d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +20 -63
app.py CHANGED
@@ -1,50 +1,23 @@
1
  import streamlit as st
2
- import webrtcvad
3
- import pydub
4
- import numpy as np
5
  import base64
6
  import io
7
  from huggingface_hub import InferenceClient
8
  from gtts import gTTS
 
9
  import speech_recognition as sr
 
10
 
11
  st.title("Chatbot de Voz a Voz")
12
 
13
  if "history" not in st.session_state:
14
  st.session_state.history = []
15
 
16
- vad = webrtcvad.Vad()
17
- vad.set_mode(3)
18
-
19
- buffer = []
20
- frames_per_buffer = 480
21
- audio_rate = 16000
22
- channels = 1
23
- seconds_per_frame = frames_per_buffer / audio_rate
24
- vad_threshold = 0.5
25
-
26
- #abrir microfono
27
- def callback(data):
28
- try:
29
- audio_array = np.frombuffer(data, dtype=np.int16)
30
- is_speech = vad.is_speech(data, sample_rate=audio_rate)
31
-
32
- buffer.append(audio_array)
33
-
34
- if not is_speech:
35
- save_audio_buffer()
36
-
37
- except Exception as e:
38
- st.error(f"Error durante la captura de audio: {e}")
39
-
40
- # voz a texto
41
  def transcribe_audio(audio_data):
42
  recognizer = sr.Recognizer()
43
- audio_chunk = sr.AudioData(audio_data, sample_rate=audio_rate, sample_width=2) # 16-bit PCM audio
44
  text = recognizer.recognize_google(audio_chunk, language="es-ES")
45
  return text
46
 
47
- # entrada al modelo de lenguaje
48
  def format_prompt(message, history):
49
  prompt = "<s>"
50
 
@@ -55,7 +28,6 @@ def format_prompt(message, history):
55
  prompt += f"[INST] {message} [/INST]"
56
  return prompt
57
 
58
- #generaci贸n de respuesta
59
  def generate(audio_text, history, temperature=None, max_new_tokens=512, top_p=0.95, repetition_penalty=1.0):
60
  client = InferenceClient("mistralai/Mixtral-8x7B-Instruct-v0.1")
61
 
@@ -84,50 +56,35 @@ def generate(audio_text, history, temperature=None, max_new_tokens=512, top_p=0.
84
  audio_file = text_to_speech(response, speed=1.3)
85
  return response, audio_file
86
 
87
- #respuesta texto a voz
88
  def text_to_speech(text, speed=1.3):
89
  tts = gTTS(text=text, lang='es')
90
  audio_fp = io.BytesIO()
91
  tts.write_to_fp(audio_fp)
92
  audio_fp.seek(0)
93
- audio = pydub.AudioSegment.from_file(audio_fp, format="mp3")
94
  modified_speed_audio = audio.speedup(playback_speed=speed)
95
  modified_audio_fp = io.BytesIO()
96
  modified_speed_audio.export(modified_audio_fp, format="mp3")
97
  modified_audio_fp.seek(0)
98
  return modified_audio_fp
99
 
100
- #captura de audio
101
- def save_audio_buffer():
102
- if buffer:
103
- audio_array = np.concatenate(buffer)
104
- audio_segment = pydub.AudioSegment(
105
- audio_array.tobytes(),
106
- frame_rate=audio_rate,
107
- sample_width=audio_array.dtype.itemsize,
108
- channels=channels,
109
- )
110
-
111
- st.audio(audio_array, format="audio/wav", channels=channels)
112
- transcribed_text = transcribe_audio(audio_array.tobytes())
113
- st.subheader("Texto Transcrito:")
114
- st.write(transcribed_text)
115
- output, audio_file = generate(transcribed_text, history=st.session_state.history)
116
-
117
- buffer.clear()
118
-
119
- #interfaz de usuario
120
  def main():
121
- st.title("Chatbot de Voz a Voz")
122
-
123
- st._webrtc_audio_recorder(
124
- key="audio",
125
- sample_rate=audio_rate,
126
- channels=channels,
127
- format="pcm",
128
- on_data=callback,
129
- start_streaming=st.button("Iniciar/Detener Grabaci贸n"),
130
- )
 
 
 
 
 
 
131
 
132
  if __name__ == "__main__":
133
  main()
 
1
  import streamlit as st
 
 
 
2
  import base64
3
  import io
4
  from huggingface_hub import InferenceClient
5
  from gtts import gTTS
6
+ import microphone as mic
7
  import speech_recognition as sr
8
+ from pydub import AudioSegment
9
 
10
  st.title("Chatbot de Voz a Voz")
11
 
12
  if "history" not in st.session_state:
13
  st.session_state.history = []
14
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
15
def transcribe_audio(audio_data, sample_rate=16000, sample_width=2):
    """Transcribe raw PCM audio bytes to Spanish text via Google Speech Recognition.

    Args:
        audio_data: raw little-endian PCM audio bytes.
        sample_rate: sampling rate of the audio in Hz (default 16000,
            matching the recorder in main()).
        sample_width: bytes per sample (default 2, i.e. 16-bit PCM).

    Returns:
        The recognized text as a string.

    Raises:
        Propagates the recognizer's errors (e.g. unknown value / request
        errors) when the audio cannot be transcribed.
    """
    recognizer = sr.Recognizer()
    # Wrap the raw bytes so the recognizer knows the PCM format.
    audio_chunk = sr.AudioData(audio_data, sample_rate=sample_rate, sample_width=sample_width)
    text = recognizer.recognize_google(audio_chunk, language="es-ES")
    return text
20
 
 
21
  def format_prompt(message, history):
22
  prompt = "<s>"
23
 
 
28
  prompt += f"[INST] {message} [/INST]"
29
  return prompt
30
 
 
31
  def generate(audio_text, history, temperature=None, max_new_tokens=512, top_p=0.95, repetition_penalty=1.0):
32
  client = InferenceClient("mistralai/Mixtral-8x7B-Instruct-v0.1")
33
 
 
56
  audio_file = text_to_speech(response, speed=1.3)
57
  return response, audio_file
58
 
 
59
def text_to_speech(text, speed=1.3):
    """Synthesize Spanish speech for ``text`` and speed it up by ``speed``.

    Returns an in-memory MP3 file-like object (BytesIO), rewound to the start.
    """
    # Render the TTS audio straight into an in-memory buffer (no temp files).
    synth = gTTS(text=text, lang='es')
    raw_buffer = io.BytesIO()
    synth.write_to_fp(raw_buffer)
    raw_buffer.seek(0)

    # Decode, apply the faster playback speed, and re-encode as MP3.
    segment = AudioSegment.from_file(raw_buffer, format="mp3")
    faster_segment = segment.speedup(playback_speed=speed)
    out_buffer = io.BytesIO()
    faster_segment.export(out_buffer, format="mp3")
    out_buffer.seek(0)
    return out_buffer
70
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
71
def main():
    """Streamlit UI: record ~7 s of microphone audio, transcribe it, generate
    a model reply, and autoplay the spoken answer in the browser.
    """
    RATE = 16000        # capture sample rate (Hz); matches transcribe_audio's default
    RECORD_SECONDS = 7  # fixed-length recording window

    # NOTE(review): `microphone` is an unusual dependency; confirm that
    # `mic.recorder` really exposes this context-manager/record API (it
    # resembles the `soundcard` package's interface).
    with mic.recorder(samplerate=RATE) as recorder:
        # Fixed mojibake: "Grabaci贸n" was UTF-8 "Grabación" mis-decoded.
        recording = st.button("Iniciar/Detener Grabación")
        if recording:
            st.info("Grabando...")
            audio_data = recorder.record(numframes=RATE * RECORD_SECONDS)
            st.success("Grabación detenida.")  # fixed mojibake
            # Recorders typically return a numpy array; the speech recognizer
            # needs raw bytes — convert when possible instead of crashing.
            if hasattr(audio_data, "tobytes"):
                audio_data = audio_data.tobytes()
            transcribed_text = transcribe_audio(audio_data)
            st.subheader("Texto Transcrito:")
            st.write(transcribed_text)
            output, audio_file = generate(transcribed_text, history=st.session_state.history)
            # Embed the MP3 reply as a base64 data URI so the browser autoplays it.
            st.markdown(
                f"""
                <audio autoplay="autoplay" controls="controls" src="data:audio/mp3;base64,{base64.b64encode(audio_file.read()).decode()}" type="audio/mp3" id="audio_player"></audio>
                """,
                unsafe_allow_html=True
            )


if __name__ == "__main__":
    main()