salomonsky committed
Commit 093b41a · verified · 1 Parent(s): f6183ce

Update app.py

Files changed (1)
  1. app.py +31 -50
app.py CHANGED
@@ -1,42 +1,12 @@
 import streamlit as st
 import base64
 import io
-import pyaudio
-import wave
-import numpy as np
-from gtts import gTTS
 from huggingface_hub import InferenceClient
+from gtts import gTTS
+from audiorecorder import audiorecorder
+import speech_recognition as sr
 
-def record_audio(filename="audio.wav", duration=5):
-    st.subheader("Habla para grabar...")
-
-    p = pyaudio.PyAudio()
-    default_input_device_index = p.get_default_input_device_info()['index']
-
-    stream = p.open(format=pyaudio.paInt16,
-                    channels=1,
-                    rate=44100,
-                    input=True,
-                    frames_per_buffer=1024)
-
-    frames = []
-
-    for i in range(0, int(44100 / 1024 * duration)):
-        data = stream.read(1024)
-        frames.append(data)
-
-    stream.stop_stream()
-    stream.close()
-    p.terminate()
-
-    wf = wave.open(filename, 'wb')
-    wf.setnchannels(1)
-    wf.setsampwidth(pyaudio.PyAudio().get_sample_size(pyaudio.paInt16))
-    wf.setframerate(44100)
-    wf.writeframes(b''.join(frames))
-    wf.close()
-
-def recognize_speech(audio_data):
+def recognize_speech(audio_data, show_messages=True):
     recognizer = sr.Recognizer()
     audio_recording = sr.AudioFile(audio_data)
 
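
Note: audio capture moves from a server-side pyaudio recording loop to the browser-based audiorecorder component, and speech_recognition is now imported explicitly; the removed recognize_speech already called sr.Recognizer() without any sr import, so the old version would have raised a NameError when called. For reference, a minimal sketch of the SpeechRecognition flow that recognize_speech builds on; the with/record step sits between the hunks shown here, so those exact lines are assumed:

    import speech_recognition as sr

    recognizer = sr.Recognizer()
    # Open the exported WAV and read the whole clip into an AudioData object.
    with sr.AudioFile("audio.wav") as source:
        audio = recognizer.record(source)
    # Send it to the Google Web Speech API in Spanish; this raises
    # sr.UnknownValueError or sr.RequestError on failure.
    audio_text = recognizer.recognize_google(audio, language="es-ES")
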
@@ -45,16 +15,28 @@ def recognize_speech(audio_data):
 
     try:
         audio_text = recognizer.recognize_google(audio, language="es-ES")
-        st.subheader("Texto Reconocido:")
-        st.write(audio_text)
-        st.success("Reconocimiento de voz completado.")
-        return audio_text
+        if show_messages:
+            st.subheader("Texto Reconocido:")
+            st.write(audio_text)
+            st.success("Reconocimiento de voz completado.")
     except sr.UnknownValueError:
         st.warning("No se pudo reconocer el audio. ¿Intentaste grabar algo?")
-        return ""
+        audio_text = ""
     except sr.RequestError:
         st.error("No he recibido ningun audio. Por favor, inténtalo de nuevo.")
-        return ""
+        audio_text = ""
+
+    return audio_text
+
+def format_prompt(message, history):
+    prompt = "<s>"
+
+    for user_prompt, bot_response in history:
+        prompt += f"[INST] {user_prompt} [/INST]"
+        prompt += f" {bot_response}</s> "
+
+    prompt += f"[INST] {message} [/INST]"
+    return prompt
 
 def generate(audio_text, history, temperature=None, max_new_tokens=512, top_p=0.95, repetition_penalty=1.0):
     client = InferenceClient("mistralai/Mixtral-8x7B-Instruct-v0.1")
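
Note: format_prompt replaces the old single-turn prompt with a multi-turn Mixtral-instruct string built from the (user_prompt, bot_response) pairs kept in history. A worked example of what it returns; the sample conversation is invented purely for illustration:

    history = [("Hola", "Hola, soy Chaman 4.0.")]
    format_prompt("¿Qué hora es?", history)
    # -> '<s>[INST] Hola [/INST] Hola, soy Chaman 4.0.</s> [INST] ¿Qué hora es? [/INST]'
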
@@ -73,7 +55,7 @@ def generate(audio_text, history, temperature=None, max_new_tokens=512, top_p=0.95, repetition_penalty=1.0):
         seed=42,
     )
 
-    formatted_prompt = f"<s>[INST] {audio_text} [/INST]</s>"
+    formatted_prompt = format_prompt(audio_text, history)
     stream = client.text_generation(formatted_prompt, **generate_kwargs, stream=True, details=True, return_full_text=True)
     response = ""
 
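
Note: the prompt sent to mistralai/Mixtral-8x7B-Instruct-v0.1 now carries the whole conversation instead of a single bracketed turn. The loop that consumes the stream falls outside this hunk; a sketch of the usual pattern, assuming the streamed detail objects from huggingface_hub expose token.text:

    # 'stream' is the generator returned by client.text_generation(..., stream=True, details=True).
    response = ""
    for chunk in stream:
        response += chunk.token.text  # append each newly generated token to the reply
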
@@ -95,24 +77,23 @@ def main():
     if "history" not in st.session_state:
         st.session_state.history = []
 
-    start_stop_button = st.button("Iniciar/Detener Detección")
+    audio_data = audiorecorder("Habla para grabar", "Deteniendo la grabación...")
 
-    if start_stop_button:
-        record_audio("audio.wav")
-
-        st.audio("audio.wav", format="audio/wav")
+    if not audio_data.empty():
+        st.audio(audio_data.export().read(), format="audio/wav")
+        audio_data.export("audio.wav", format="wav")
         audio_text = recognize_speech("audio.wav")
 
         if not st.session_state.history:
             pre_prompt = "Te Llamarás Chaman 4.0 y tus respuestas serán sumamente breves."
             output, _ = generate(pre_prompt, history=st.session_state.history)
             st.session_state.history.append((pre_prompt, output))
-
+
         if audio_text:
-            output, audio_file = generate(audio_text, history=st.session_state.history)
+            output, audio_file = generate(audio_text, history=st.session_state.history)
 
-        if audio_text:
-            st.session_state.history.append((audio_text, output))
+        if audio_text:
+            st.session_state.history.append((audio_text, output))
 
         if audio_file is not None:
             st.markdown(
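
Note: main() now records through the audiorecorder widget, previews the capture, exports it to audio.wav, transcribes it, and seeds the conversation with a pre_prompt before answering. generate() also returns an audio_file that the st.markdown call at the tail of this hunk embeds; given the gTTS, io, and base64 imports, that is presumably a data-URI audio tag along these lines (a hedged sketch with an illustrative helper name, not the commit's exact code):

    import base64
    import io

    import streamlit as st
    from gtts import gTTS

    def speak(response_text):
        # Synthesize the reply in Spanish and keep the MP3 in memory.
        tts = gTTS(response_text, lang="es")
        buf = io.BytesIO()
        tts.write_to_fp(buf)
        b64 = base64.b64encode(buf.getvalue()).decode()
        # Embed the audio as an autoplaying data-URI tag.
        st.markdown(
            f'<audio autoplay src="data:audio/mp3;base64,{b64}"></audio>',
            unsafe_allow_html=True,
        )
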
 