File size: 797 Bytes
c5f8e1d
c7e3088
0003cc7
d1e03b7
c5f8e1d
3e0dbc5
c7e3088
d1e03b7
c7e3088
cadfe1a
c7e3088
 
 
3e0dbc5
 
 
 
c5f8e1d
c7e3088
7fcc45d
c5f8e1d
d1e03b7
 
 
 
c5f8e1d
7fcc45d
c5f8e1d
7fcc45d
c5f8e1d
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
import gradio as gr
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, VitsModel
from nemo.collections.asr.models import EncDecMultiTaskModel


# load speech to text model
# Fetches the pretrained 'nvidia/canary-1b' checkpoint at import time, switches
# the model to eval mode and places it on the CPU.
canary_model = EncDecMultiTaskModel.from_pretrained('nvidia/canary-1b')
canary_model.eval()
canary_model.to('cpu')

# update decode params
# The decoding strategy is first changed with None, then the model's own
# decoding config is edited to a beam size of 1 and applied again.
# NOTE(review): presumably the None call resets decoding to its defaults and
# beam_size = 1 makes beam search behave like greedy decoding - confirm
# against the NeMo documentation.
canary_model.change_decoding_strategy(None)
decode_cfg = canary_model.cfg.decoding
decode_cfg.beam.beam_size = 1
canary_model.change_decoding_strategy(decode_cfg)



def convert_speech(speech):
    """Transcribe speech to text with the module-level Canary model.

    Args:
        speech: Path to an audio file, or a list of such paths. ``None`` or
            an empty value (nothing was recorded) is accepted.

    Returns:
        An empty string when there is no audio to transcribe. When a single
        path string was given and the model produced exactly one result,
        that one result. Otherwise whatever ``canary_model.transcribe``
        returned for the supplied inputs.
    """
    # Nothing recorded/uploaded: skip the (expensive) model call entirely and
    # give the text output something sensible to display.
    if speech is None or (isinstance(speech, (str, list, tuple)) and not speech):
        return ""

    single_path = isinstance(speech, str)
    # Wrap a bare path in a list so it is handled as one audio file.
    # NOTE(review): some NeMo releases interpret a plain string argument as a
    # manifest path rather than an audio file - confirm for the installed
    # version; a one-element list is accepted either way.
    audio_inputs = [speech] if single_path else speech

    # Convert the speech to text
    transcription = canary_model.transcribe(
        audio_inputs,
        logprobs=False,
    )

    # One file in -> return the one result instead of a one-element list, so a
    # text output shows the transcription rather than the list's repr.
    if (
        single_path
        and isinstance(transcription, (list, tuple))
        and len(transcription) == 1
    ):
        return transcription[0]
    return transcription

# Build the microphone demo. type="filepath" makes Gradio pass the handler a
# path to the recorded audio file; without it the Audio input uses its default
# type, which delivers a (sample_rate, numpy array) tuple that the
# speech-to-text model's file-based transcribe call cannot consume.
# NOTE(review): gr.inputs is Gradio's legacy namespace; newer releases expose
# the component as gr.Audio - confirm the installed version before migrating.
iface = gr.Interface(
    fn=convert_speech,
    inputs=gr.inputs.Audio(source="microphone", type="filepath"),
    outputs="text",
)

iface.launch()