gdnartea committed on
Commit
c7e3088
·
verified ·
1 Parent(s): 3e0dbc5

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +18 -14
app.py CHANGED
@@ -1,36 +1,40 @@
1
  import gradio as gr
2
- from transformers import AutoTokenizer, AutoModelForCausalLM, Speech2TextProcessor, Speech2TextForConditionalGeneration, VitsProcessor, VitsForConditionalGeneration
3
- from nemo.collections.asr.models import EncDecMultiTaskModel
 
4
 
5
- # Load the ASR model and processor // fix processor stuff first
6
- asr_processor = Speech2TextProcessor.from_pretrained("/path/to/canary/processor")
7
- asr_model = EncDecMultiTaskModel.from_pretrained('nvidia/canary-1b')
8
 
9
- # update dcode params
 
 
 
 
 
10
  decode_cfg = canary_model.cfg.decoding
11
  decode_cfg.beam.beam_size = 1
12
  canary_model.change_decoding_strategy(decode_cfg)
13
 
14
 
 
 
 
15
  # Load the text processing model and tokenizer
16
  proc_tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-128k-instruct")
17
  proc_model = AutoModelForCausalLM.from_pretrained(
18
  "microsoft/Phi-3-mini-128k-instruct",
19
  device_map="cuda",
20
  torch_dtype="auto",
21
- trust_remote_code=True,
 
22
 
23
  # Load the TTS model and processor
24
- tts_processor = VitsProcessor.from_pretrained("/path/to/vits/processor")
25
- tts_model = VitsForConditionalGeneration.from_pretrained("/path/to/vits/model")
 
26
 
27
  def process_speech(speech):
28
  # Convert the speech to text
29
- inputs = asr_processor(speech, sampling_rate=16_000, return_tensors="pt", padding=True)
30
- with torch.no_grad():
31
- logits = asr_model(inputs.input_values, attention_mask=inputs.attention_mask).logits
32
- predicted_ids = torch.argmax(logits, dim=-1)
33
- transcription = asr_processor.decode(predicted_ids[0])
34
 
35
  # Process the text
36
  inputs = proc_tokenizer.encode(transcription + proc_tokenizer.eos_token, return_tensors='pt')
 
1
  import gradio as gr
2
+ import torch
3
+ from transformers import AutoTokenizer, AutoModelForCausalLM, VitsForConditionalGeneration, VitsProcessor
4
+ from nemo.collections.asr.models import ASRModel
5
 
 
 
 
6
 
7
+ # load speech to text model
8
+ canary_model = ASRModel.from_pretrained('nvidia/canary-1b')
9
+ canary_model.eval()
10
+
11
+ # update decode params
12
+ canary_model.change_decoding_strategy(None)
13
  decode_cfg = canary_model.cfg.decoding
14
  decode_cfg.beam.beam_size = 1
15
  canary_model.change_decoding_strategy(decode_cfg)
16
 
17
 
18
+
19
+
20
+
21
  # Load the text processing model and tokenizer
22
  proc_tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-128k-instruct")
23
  proc_model = AutoModelForCausalLM.from_pretrained(
24
  "microsoft/Phi-3-mini-128k-instruct",
25
  device_map="cuda",
26
  torch_dtype="auto",
27
+ trust_remote_code=True, )
28
+ )
29
 
30
  # Load the TTS model and processor
31
+ tts_processor = VitsProcessor.from_pretrained("facebook/mms-tts-eng")
32
+ tts_model = VitsForConditionalGeneration.from_pretrained("facebook/mms-tts-eng")
33
+
34
 
35
  def process_speech(speech):
36
  # Convert the speech to text
37
+ transcription = canary_model.transcribe(speech, logprobs=False)
 
 
 
 
38
 
39
  # Process the text
40
  inputs = proc_tokenizer.encode(transcription + proc_tokenizer.eos_token, return_tensors='pt')