Spaces:

gdnartea
/

Chatty_Ashe

Runtime error

App Files Files Community

gdnartea committed on May 1, 2024

Commit

c7e3088

verified ·

1 Parent(s): 3e0dbc5

Update app.py

Browse files

Files changed (1) hide show

app.py +18 -14

app.py CHANGED Viewed

@@ -1,36 +1,40 @@
 import gradio as gr
-from transformers import AutoTokenizer, AutoModelForCausalLM, Speech2TextProcessor, Speech2TextForConditionalGeneration, VitsProcessor, VitsForConditionalGeneration
-from nemo.collections.asr.models import EncDecMultiTaskModel
-# Load the ASR model and processor // fix processor stuff first
-asr_processor = Speech2TextProcessor.from_pretrained("/path/to/canary/processor")
-asr_model = EncDecMultiTaskModel.from_pretrained('nvidia/canary-1b')
-# update dcode params
 decode_cfg = canary_model.cfg.decoding
 decode_cfg.beam.beam_size = 1
 canary_model.change_decoding_strategy(decode_cfg)
 # Load the text processing model and tokenizer
 proc_tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-128k-instruct")
 proc_model = AutoModelForCausalLM.from_pretrained(
     "microsoft/Phi-3-mini-128k-instruct",
     device_map="cuda",
     torch_dtype="auto",
-    trust_remote_code=True,
 # Load the TTS model and processor
-tts_processor = VitsProcessor.from_pretrained("/path/to/vits/processor")
-tts_model = VitsForConditionalGeneration.from_pretrained("/path/to/vits/model")
 def process_speech(speech):
     # Convert the speech to text
-    inputs = asr_processor(speech, sampling_rate=16_000, return_tensors="pt", padding=True)
-    with torch.no_grad():
-        logits = asr_model(inputs.input_values, attention_mask=inputs.attention_mask).logits
-    predicted_ids = torch.argmax(logits, dim=-1)
-    transcription = asr_processor.decode(predicted_ids[0])
     # Process the text
     inputs = proc_tokenizer.encode(transcription + proc_tokenizer.eos_token, return_tensors='pt')

 import gradio as gr
+import torch
+from transformers import AutoTokenizer, AutoModelForCausalLM, VitsForConditionalGeneration, VitsProcessor
+from nemo.collections.asr.models import ASRModel
+# load speech to text model
+canary_model = ASRModel.from_pretrained('nvidia/canary-1b')
+canary_model.eval()
+# update decode params
+canary_model.change_decoding_strategy(None)
 decode_cfg = canary_model.cfg.decoding
 decode_cfg.beam.beam_size = 1
 canary_model.change_decoding_strategy(decode_cfg)
 # Load the text processing model and tokenizer
 proc_tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-128k-instruct")
 proc_model = AutoModelForCausalLM.from_pretrained(
     "microsoft/Phi-3-mini-128k-instruct",
     device_map="cuda",
     torch_dtype="auto",
+    trust_remote_code=True, )
+)
 # Load the TTS model and processor
+tts_processor = VitsProcessor.from_pretrained("facebook/mms-tts-eng")
+tts_model = VitsForConditionalGeneration.from_pretrained("facebook/mms-tts-eng")
 def process_speech(speech):
     # Convert the speech to text
+    transcription = canary_model.transcribe(speech, logprobs=False)
     # Process the text
     inputs = proc_tokenizer.encode(transcription + proc_tokenizer.eos_token, return_tensors='pt')