salomonsky committed on
Commit
8f22654
verified
1 Parent(s): ae5f7c4

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +47 -36
app.py CHANGED
@@ -3,19 +3,22 @@ import base64
3
  import io
4
  from huggingface_hub import InferenceClient
5
  from gtts import gTTS
6
- import speech_recognition as sr
7
  from audiorecorder import audiorecorder
 
 
8
 
9
  if "history" not in st.session_state:
10
  st.session_state.history = []
11
 
12
  def recognize_speech(audio_data, show_messages=True):
13
  recognizer = sr.Recognizer()
14
- audio = sr.AudioData(audio_data, sample_rate=16000)
 
 
 
15
 
16
  try:
17
- # Eliminar la clave de API de Google Cloud Speech
18
- audio_text = recognizer.recognize_google(audio, language="es-us")
19
  if show_messages:
20
  st.subheader("Texto Reconocido:")
21
  st.write(audio_text)
@@ -54,8 +57,6 @@ def generate(audio_text, history, temperature=None, max_new_tokens=512, top_p=0.
54
  repetition_penalty=repetition_penalty,
55
  do_sample=True,
56
  seed=42,
57
- # Habilitar el uso de la caché del modelo
58
- use_cache=True
59
  )
60
 
61
  formatted_prompt = format_prompt(audio_text, history)
@@ -66,37 +67,47 @@ def generate(audio_text, history, temperature=None, max_new_tokens=512, top_p=0.
66
  response += response_token.token.text
67
 
68
  response = ' '.join(response.split()).replace('</s>', '')
69
- audio_file = text_to_speech(response)
70
  return response, audio_file
71
 
72
- def text_to_speech(text):
73
- tts = gTTS(text, lang="es", tld="io", name="Alvaro")
74
- audio_file = io.BytesIO()
75
- tts.write_to_fp(audio_file)
76
- audio_file.seek(0)
77
- return audio_file
78
-
79
- def background_process(audio_data):
80
- audio_text = recognize_speech(audio_data, show_messages=False)
81
- if audio_text:
82
- output, audio_file = generate(audio_text, history=st.session_state.history)
83
- st.session_state.history.append((audio_text, output))
84
- st.markdown(f"## Respuesta generada:\n{output}")
85
- st.audio(audio_file, format="audio/mp3")
86
 
87
  def main():
88
- audio_recorder = audiorecorder(dur=10, filename="audio.wav", listen_background=background_process, noise_reduction=True)
89
- st.markdown("## Grabar audio")
90
- st.markdown("Presiona el botón de abajo para grabar tu voz durante 10 segundos.")
91
- record_state = audio_recorder.record()
92
- if record_state:
93
- st.info("Grabando...")
94
- else:
95
- st.success("Grabación completada.")
96
- st.markdown("## Reproducir audio")
97
- st.markdown("Presiona el botón de abajo para reproducir el audio grabado.")
98
- play_state = audio_recorder.play()
99
- if play_state:
100
- st.info("Reproduciendo...")
101
- else:
102
- st.success("Reproducción finalizada.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3
  import io
4
  from huggingface_hub import InferenceClient
5
  from gtts import gTTS
 
6
  from audiorecorder import audiorecorder
7
+ import speech_recognition as sr
8
+ from pydub import AudioSegment
9
 
10
  if "history" not in st.session_state:
11
  st.session_state.history = []
12
 
13
  def recognize_speech(audio_data, show_messages=True):
14
  recognizer = sr.Recognizer()
15
+ audio_recording = sr.AudioFile(audio_data)
16
+
17
+ with audio_recording as source:
18
+ audio = recognizer.record(source)
19
 
20
  try:
21
+ audio_text = recognizer.recognize_google(audio, language="es-ES")
 
22
  if show_messages:
23
  st.subheader("Texto Reconocido:")
24
  st.write(audio_text)
 
57
  repetition_penalty=repetition_penalty,
58
  do_sample=True,
59
  seed=42,
 
 
60
  )
61
 
62
  formatted_prompt = format_prompt(audio_text, history)
 
67
  response += response_token.token.text
68
 
69
  response = ' '.join(response.split()).replace('</s>', '')
70
+ audio_file = text_to_speech(response, speed=1.3)
71
  return response, audio_file
72
 
73
def text_to_speech(text, speed=1.3):
    """Synthesize Spanish speech for *text* and return it as in-memory MP3.

    Args:
        text: The text to speak.
        speed: Playback-speed multiplier applied after synthesis. A value of
            ``1.0`` returns the synthesized audio unchanged.

    Returns:
        An ``io.BytesIO`` positioned at offset 0 that holds MP3 data, or
        ``None`` when *text* is empty or whitespace-only (there is nothing
        to speak).
    """
    # gTTS rejects empty input, so report "no audio" instead of crashing;
    # the caller in main() already checks for ``audio_file is not None``.
    if not text or not text.strip():
        return None

    tts = gTTS(text=text, lang='es')
    audio_fp = io.BytesIO()
    tts.write_to_fp(audio_fp)
    audio_fp.seek(0)

    # No re-encoding round trip is needed when the speed is left untouched.
    if speed == 1.0:
        return audio_fp

    audio = AudioSegment.from_file(audio_fp, format="mp3")
    try:
        modified_speed_audio = audio.speedup(playback_speed=speed)
    except Exception:
        # pydub's speedup raises a plain Exception when the clip is too short
        # to be split into chunks; a very brief reply should still be played,
        # just at normal speed.
        audio_fp.seek(0)
        return audio_fp

    modified_audio_fp = io.BytesIO()
    modified_speed_audio.export(modified_audio_fp, format="mp3")
    modified_audio_fp.seek(0)
    return modified_audio_fp
 
 
 
84
 
85
  def main():
86
+ audio_data = audiorecorder("Habla para grabar", "Deteniendo la grabaci贸n...")
87
+
88
+ if not audio_data.empty():
89
+ st.audio(audio_data.export().read(), format="audio/wav")
90
+ audio_data.export("audio.wav", format="wav")
91
+ audio_text = recognize_speech("audio.wav")
92
+
93
+ if not st.session_state.history:
94
+ pre_prompt = "Te Llamar谩s Chaman 4.0 y tus respuestas ser谩n sumamente breves."
95
+ output, _ = generate(pre_prompt, history=st.session_state.history)
96
+ st.session_state.history.append((pre_prompt, output))
97
+
98
+ if audio_text:
99
+ output, audio_file = generate(audio_text, history=st.session_state.history)
100
+
101
+ if audio_text:
102
+ st.session_state.history.append((audio_text, output))
103
+
104
+ if audio_file is not None:
105
+ st.markdown(
106
+ f"""
107
+ <audio autoplay="autoplay" controls="controls" src="data:audio/mp3;base64,{base64.b64encode(audio_file.read()).decode()}" type="audio/mp3" id="audio_player"></audio>
108
+ """,
109
+ unsafe_allow_html=True
110
+ )
111
+
112
+ if __name__ == "__main__":
113
+ main()