gdnartea committed on
Commit 197f7f7 · verified · 1 Parent(s): 34212d7

Update app.py

Files changed (1)
  1. app.py +10 -7
app.py CHANGED
@@ -40,11 +40,10 @@ decoding_cfg = model.cfg.decoding
 decoding_cfg.beam.beam_size = 1
 model.change_decoding_strategy(decoding_cfg)
 
-'''
 vits_model = VitsModel.from_pretrained("facebook/mms-tts-eng")
 vits_tokenizer = AutoTokenizer.from_pretrained("facebook/mms-tts-eng")
 set_seed(555)
-'''
+
 
 def text_to_speech(text_response):
     inputs = vits_tokenizer(text=text_response, return_tensors="pt")
@@ -87,7 +86,7 @@ def transcribe(audio_filepath):
         "source_lang": "en",
         "target_lang": "en",
         "taskname": "asr",
-        "pnc": "no",
+        "pnc": "yes",
         "answer": "predict",
         "duration": str(duration),
     }
@@ -121,12 +120,12 @@ def generate_response(user_input):
     with torch.no_grad():
         outputs = proc_model.generate(
             inputs,
-            max_new_tokens=32,
+            max_new_tokens=48,
         )
 
     response = proc_tokenizer.batch_decode(
         outputs,
-        #skip_special_tokens=True,
+        skip_special_tokens=True,
         clean_up_tokenization_spaces=False,
     )[0]
 
@@ -137,15 +136,19 @@ def CanaryPhi(audio_filepath):
     print(user_input)
     response = generate_response(user_input)
     print(response)
+    chatty_response = text_to_speech(response)
     return response
 
 
 # Create a Gradio interface
 iface = gr.Interface(
     fn=CanaryPhi,
-    inputs=gr.Audio(sources="microphone", type="filepath"),
-    outputs=gr.Textbox(),
+    inputs=gr.Audio(sources=["microphone", "upload"], type="filepath"),
+    #inputs=gr.Textbox(lines=5, placeholder="Enter your text here..."),
+    #outputs=gr.Textbox(),
+    outputs=gr.Audio("response.wav"),
     )
 
 # Launch the interface
+iface.queue()
 iface.launch()
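
With this commit the interface returns an audio reply: CanaryPhi now calls text_to_speech(response) and the output component becomes gr.Audio("response.wav"), so the helper is expected to write the synthesized waveform to that file (note that CanaryPhi still returns the text response rather than an audio path, which a gr.Audio output would normally expect). The body of text_to_speech beyond its first line is not shown in this diff; the following is a minimal sketch of what it would need to do with the MMS-TTS checkpoint loaded above. The scipy dependency, the return value, and the assumption that the file is written as "response.wav" are illustrative, not confirmed by the commit.

    import torch
    from scipy.io import wavfile
    from transformers import VitsModel, AutoTokenizer, set_seed

    vits_model = VitsModel.from_pretrained("facebook/mms-tts-eng")
    vits_tokenizer = AutoTokenizer.from_pretrained("facebook/mms-tts-eng")
    set_seed(555)

    def text_to_speech(text_response):
        # Tokenize the reply and synthesize a waveform with the VITS TTS model
        inputs = vits_tokenizer(text=text_response, return_tensors="pt")
        with torch.no_grad():
            waveform = vits_model(**inputs).waveform  # shape: (1, num_samples)
        # Write to the path that the gr.Audio output component points at
        wavfile.write("response.wav",
                      rate=vits_model.config.sampling_rate,
                      data=waveform.squeeze().numpy())
        return "response.wav"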