Spaces:
Runtime error
Runtime error
Update app.py
Browse files
app.py
CHANGED
@@ -1,36 +1,40 @@
|
|
1 |
import gradio as gr
|
2 |
-
|
3 |
-
from
|
|
|
4 |
|
5 |
-
# Load the ASR model and processor // fix processor stuff first
|
6 |
-
asr_processor = Speech2TextProcessor.from_pretrained("/path/to/canary/processor")
|
7 |
-
asr_model = EncDecMultiTaskModel.from_pretrained('nvidia/canary-1b')
|
8 |
|
9 |
-
#
|
|
|
|
|
|
|
|
|
|
|
10 |
decode_cfg = canary_model.cfg.decoding
|
11 |
decode_cfg.beam.beam_size = 1
|
12 |
canary_model.change_decoding_strategy(decode_cfg)
|
13 |
|
14 |
|
|
|
|
|
|
|
15 |
# Load the text processing model and tokenizer
|
16 |
proc_tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-128k-instruct")
|
17 |
proc_model = AutoModelForCausalLM.from_pretrained(
|
18 |
"microsoft/Phi-3-mini-128k-instruct",
|
19 |
device_map="cuda",
|
20 |
torch_dtype="auto",
|
21 |
-
trust_remote_code=True,
|
|
|
22 |
|
23 |
# Load the TTS model and processor
|
24 |
-
tts_processor = VitsProcessor.from_pretrained("/
|
25 |
-
tts_model = VitsForConditionalGeneration.from_pretrained("/
|
|
|
26 |
|
27 |
def process_speech(speech):
|
28 |
# Convert the speech to text
|
29 |
-
|
30 |
-
with torch.no_grad():
|
31 |
-
logits = asr_model(inputs.input_values, attention_mask=inputs.attention_mask).logits
|
32 |
-
predicted_ids = torch.argmax(logits, dim=-1)
|
33 |
-
transcription = asr_processor.decode(predicted_ids[0])
|
34 |
|
35 |
# Process the text
|
36 |
inputs = proc_tokenizer.encode(transcription + proc_tokenizer.eos_token, return_tensors='pt')
|
|
|
1 |
import gradio as gr
|
2 |
+
import torch
|
3 |
+
from transformers import AutoTokenizer, AutoModelForCausalLM, VitsForConditionalGeneration, VitsProcessor
|
4 |
+
from nemo.collections.asr.models import ASRModel
|
5 |
|
|
|
|
|
|
|
6 |
|
7 |
+
# load speech to text model
|
8 |
+
canary_model = ASRModel.from_pretrained('nvidia/canary-1b')
|
9 |
+
canary_model.eval()
|
10 |
+
|
11 |
+
# update decode params
|
12 |
+
canary_model.change_decoding_strategy(None)
|
13 |
decode_cfg = canary_model.cfg.decoding
|
14 |
decode_cfg.beam.beam_size = 1
|
15 |
canary_model.change_decoding_strategy(decode_cfg)
|
16 |
|
17 |
|
18 |
+
|
19 |
+
|
20 |
+
|
21 |
# Load the text processing model and tokenizer
|
22 |
proc_tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-128k-instruct")
|
23 |
proc_model = AutoModelForCausalLM.from_pretrained(
|
24 |
"microsoft/Phi-3-mini-128k-instruct",
|
25 |
device_map="cuda",
|
26 |
torch_dtype="auto",
|
27 |
+
trust_remote_code=True, )
|
28 |
+
)
|
29 |
|
30 |
# Load the TTS model and processor
|
31 |
+
tts_processor = VitsProcessor.from_pretrained("facebook/mms-tts-eng")
|
32 |
+
tts_model = VitsForConditionalGeneration.from_pretrained("facebook/mms-tts-eng")
|
33 |
+
|
34 |
|
35 |
def process_speech(speech):
|
36 |
# Convert the speech to text
|
37 |
+
transcription = canary_model.transcribe(speech, logprobs=False)
|
|
|
|
|
|
|
|
|
38 |
|
39 |
# Process the text
|
40 |
inputs = proc_tokenizer.encode(transcription + proc_tokenizer.eos_token, return_tensors='pt')
|