gdnartea committed on
Commit 197f7f7 · verified · 1 Parent(s): 34212d7

Update app.py

Files changed (1)
  1. app.py +10 -7
app.py CHANGED
@@ -40,11 +40,10 @@ decoding_cfg = model.cfg.decoding
 decoding_cfg.beam.beam_size = 1
 model.change_decoding_strategy(decoding_cfg)
 
-'''
 vits_model = VitsModel.from_pretrained("facebook/mms-tts-eng")
 vits_tokenizer = AutoTokenizer.from_pretrained("facebook/mms-tts-eng")
 set_seed(555)
-'''
+
 
 def text_to_speech(text_response):
     inputs = vits_tokenizer(text=text_response, return_tensors="pt")
@@ -87,7 +86,7 @@ def transcribe(audio_filepath):
         "source_lang": "en",
         "target_lang": "en",
         "taskname": "asr",
-        "pnc": "no",
+        "pnc": "yes",
         "answer": "predict",
         "duration": str(duration),
     }
@@ -121,12 +120,12 @@ def generate_response(user_input):
     with torch.no_grad():
         outputs = proc_model.generate(
             inputs,
-            max_new_tokens=32,
+            max_new_tokens=48,
         )
 
     response = proc_tokenizer.batch_decode(
         outputs,
-        #skip_special_tokens=True,
+        skip_special_tokens=True,
         clean_up_tokenization_spaces=False,
     )[0]
 
@@ -137,15 +136,19 @@ def CanaryPhi(audio_filepath):
     print(user_input)
     response = generate_response(user_input)
     print(response)
+    chatty_response = text_to_speech(response)
     return response
 
 
 # Create a Gradio interface
 iface = gr.Interface(
     fn=CanaryPhi,
-    inputs=gr.Audio(sources="microphone", type="filepath"),
-    outputs=gr.Textbox(),
+    inputs=gr.Audio(sources=["microphone", "upload"], type="filepath"),
+    #inputs=gr.Textbox(lines=5, placeholder="Enter your text here..."),
+    #outputs=gr.Textbox(),
+    outputs=gr.Audio("response.wav"),
     )
 
 # Launch the interface
+iface.queue()
 iface.launch()
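
With this commit the interface returns an audio reply: CanaryPhi now calls text_to_speech(response) and the output component becomes gr.Audio("response.wav"), so the helper is expected to write the synthesized waveform to that file (note that CanaryPhi still returns the text response rather than an audio path, which a gr.Audio output would normally expect). The body of text_to_speech beyond its first line is not shown in this diff; the following is a minimal sketch of what it would need to do with the MMS-TTS checkpoint loaded above. The scipy dependency, the return value, and the assumption that the file is written as "response.wav" are illustrative, not confirmed by the commit.

    import torch
    from scipy.io import wavfile
    from transformers import VitsModel, AutoTokenizer, set_seed

    vits_model = VitsModel.from_pretrained("facebook/mms-tts-eng")
    vits_tokenizer = AutoTokenizer.from_pretrained("facebook/mms-tts-eng")
    set_seed(555)

    def text_to_speech(text_response):
        # Tokenize the reply and synthesize a waveform with the VITS TTS model
        inputs = vits_tokenizer(text=text_response, return_tensors="pt")
        with torch.no_grad():
            waveform = vits_model(**inputs).waveform  # shape: (1, num_samples)
        # Write to the path that the gr.Audio output component points at
        wavfile.write("response.wav",
                      rate=vits_model.config.sampling_rate,
                      data=waveform.squeeze().numpy())
        return "response.wav"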