gdnartea committed
Commit c4d418c · verified · Parent: 9dc2324

Update app.py

Files changed (1): app.py (+8 -18)
app.py CHANGED

@@ -13,9 +13,6 @@ from nemo.collections.asr.models import ASRModel
 from nemo.collections.asr.parts.utils.streaming_utils import FrameBatchMultiTaskAED
 from nemo.collections.asr.parts.utils.transcribe_utils import get_buffered_pred_feat_multitaskAED
 
-import tracemalloc as tm
-
-tm.start()
 
 torch.random.manual_seed(0)
 proc_model_name = "microsoft/Phi-3-mini-4k-instruct"
@@ -29,9 +26,6 @@ proc_model = AutoModelForCausalLM.from_pretrained(
 proc_model.to("cpu")
 proc_tokenizer = AutoTokenizer.from_pretrained(proc_model_name)
 
-print(tm.get_traced_memory())
-tm.stop()
-
 
 SAMPLE_RATE = 16000 # Hz
 MAX_AUDIO_MINUTES = 10 # wont try to transcribe if longer than this
@@ -45,16 +39,12 @@ decoding_cfg = model.cfg.decoding
 decoding_cfg.beam.beam_size = 1
 model.change_decoding_strategy(decoding_cfg)
 
-print(tm.get_traced_memory())
-
 
 
 vits_model = VitsModel.from_pretrained("facebook/mms-tts-eng")
 vits_tokenizer = AutoTokenizer.from_pretrained("facebook/mms-tts-eng")
 set_seed(555)
 
-print(tm.get_traced_memory())
-tm.stop()
 
 def text_to_speech(text_response):
     inputs = vits_tokenizer(text=text_response, return_tensors="pt")
@@ -69,7 +59,6 @@ def text_to_speech(text_response):
 def convert_audio(audio_filepath, tmpdir, utt_id):
 
     data, sr = librosa.load(audio_filepath, sr=None, mono=True)
-
     duration = librosa.get_duration(y=data, sr=sr)
 
     if sr != SAMPLE_RATE:
@@ -79,7 +68,6 @@ def convert_audio(audio_filepath, tmpdir, utt_id):
 
     # save output audio
     sf.write(out_filename, data, SAMPLE_RATE)
-
     return out_filename, duration
 
 def transcribe(audio_filepath):
@@ -125,8 +113,6 @@ def generate_response(user_input):
         add_generation_prompt=True,
         return_tensors="pt",
     )
-
-
 
     with torch.no_grad():
         outputs = proc_model.generate(
@@ -142,19 +128,23 @@
 
     return response
 
-def CanaryPhi(audio_filepath):
-    user_input = transcribe(audio_filepath)
+def CanaryPhiVits(user_voice):
+    user_input = transcribe(user_voice)
     print(user_input)
     response = generate_response(user_input)
     print(response)
     chatty_response = text_to_speech(response)
+
+    if chatty_response.startswith(user_input):
+        chatty_response = chatty_response.replace(user_input, '', 1)
+
     return chatty_response
 
 
 # Create a Gradio interface
 iface = gr.Interface(
-    fn=CanaryPhi,
-    inputs=gr.Audio(sources=["microphone", "upload"], type="filepath"),
+    fn=CanaryPhiVits,
+    inputs=gr.Audio(sources=["microphone", "upload"], type="filepath", format="wav",),
     #inputs=gr.Textbox(lines=5, placeholder="Enter your text here..."),
     #outputs=gr.Textbox(),
     outputs=gr.Audio("response.wav"),
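
The main behavioral change in this commit is that the renamed `CanaryPhiVits` handler now strips the user's transcribed prompt from the reply when the language model echoes it back. Below is a minimal standalone sketch of that trimming step; the name `strip_echoed_prompt` and the sample strings are illustrative, not part of the commit, and note that the commit applies the check to the value returned by text_to_speech, whereas this sketch applies it to plain text, which is what the string operations assume.

def strip_echoed_prompt(response: str, user_input: str) -> str:
    # Mirrors the startswith/replace check added in this commit:
    # drop the echoed prompt from the start of the model's reply.
    if response.startswith(user_input):
        # Remove only the first occurrence so later repeats survive.
        response = response.replace(user_input, '', 1)
    return response

# Example: the echoed prompt is trimmed, the rest of the reply is kept.
print(strip_echoed_prompt("What is NeMo? NeMo is a toolkit.", "What is NeMo?"))
# -> " NeMo is a toolkit."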