gdnartea committed
Commit 97bffa5 · verified · 1 Parent(s): 0b350c3

Update app.py

Files changed (1)
  1. app.py +122 -21
app.py CHANGED
@@ -1,29 +1,130 @@
- from transformers import pipeline
- from gradio import Interface, Audio
-
- # Create a speech-to-text pipeline
- speech_to_text = pipeline("automatic-speech-recognition", model="nvidia/canary-1b")
-
- # Define a function to convert speech to text
- def convert_speech_to_text(audio):
-     # The pipeline function expects a path to an audio file, so we need to save the audio first
-     with open('temp.wav', 'wb') as f:
-         f.write(audio)
-     # Use the pipeline to convert speech to text
-     result = speech_to_text('temp.wav')
-     return result[0]['text']
-
- # Create a Gradio interface
- iface = Interface(
-     fn=convert_speech_to_text,
-     inputs=Audio(source="microphone", type="file"),
-     outputs="text"
  )
- # Launch the interface
  iface.launch()
+ import gradio as gr
+ import json
+ import librosa
+ import os
+ import soundfile as sf
+ import tempfile
+ import uuid
+
+ import torch
+
+ from nemo.collections.asr.models import ASRModel
+ from nemo.collections.asr.parts.utils.streaming_utils import FrameBatchMultiTaskAED
+ from nemo.collections.asr.parts.utils.transcribe_utils import get_buffered_pred_feat_multitaskAED
+
+ SAMPLE_RATE = 16000  # Hz
+ MAX_AUDIO_MINUTES = 10  # won't try to transcribe if longer than this
+
+ model = ASRModel.from_pretrained("nvidia/canary-1b")
+ model.eval()
+
+ # make sure beam size is always 1 for consistency
+ model.change_decoding_strategy(None)
+ decoding_cfg = model.cfg.decoding
+ decoding_cfg.beam.beam_size = 1
+ model.change_decoding_strategy(decoding_cfg)
+
+ # setup for buffered inference
+ model.cfg.preprocessor.dither = 0.0
+ model.cfg.preprocessor.pad_to = 0
+
+ feature_stride = model.cfg.preprocessor['window_stride']
+ model_stride_in_secs = feature_stride * 8  # model stride is 8 for FastConformer
+
+ frame_asr = FrameBatchMultiTaskAED(
+     asr_model=model,
+     frame_len=40.0,
+     total_buffer=40.0,
+     batch_size=16,
  )
 
+ amp_dtype = torch.float16
+
+
+ def convert_audio(audio_filepath, tmpdir, utt_id):
+     """
+     Convert all files to mono-channel 16 kHz wav files.
+     Do not convert, and raise an error, if the audio is too long.
+     Returns the output filename and duration.
+     """
+
+     data, sr = librosa.load(audio_filepath, sr=None, mono=True)
+
+     duration = librosa.get_duration(y=data, sr=sr)
+
+     if duration / 60.0 > MAX_AUDIO_MINUTES:
+         raise gr.Error(
+             f"This demo can transcribe up to {MAX_AUDIO_MINUTES} minutes of audio. "
+             "If you wish, you may trim the audio using the Audio viewer in Step 1 "
+             "(click on the scissors icon to start trimming audio)."
+         )
+
+     if sr != SAMPLE_RATE:
+         data = librosa.resample(data, orig_sr=sr, target_sr=SAMPLE_RATE)
+
+     out_filename = os.path.join(tmpdir, utt_id + '.wav')
+
+     # save output audio
+     sf.write(out_filename, data, SAMPLE_RATE)
+
+     return out_filename, duration
+
+ def transcribe(audio_filepath):
+
+     if audio_filepath is None:
+         raise gr.Error("Please provide some input audio: either upload an audio file or use the microphone")
+
+     utt_id = uuid.uuid4()
+     with tempfile.TemporaryDirectory() as tmpdir:
+         converted_audio_filepath, duration = convert_audio(audio_filepath, tmpdir, str(utt_id))
+
+         # make manifest file and save
+         manifest_data = {
+             "audio_filepath": converted_audio_filepath,
+             "source_lang": "en",
+             "target_lang": "en",
+             "taskname": "asr",
+             "pnc": "no",
+             "answer": "predict",
+             "duration": str(duration),
+         }
+
+         manifest_filepath = os.path.join(tmpdir, f'{utt_id}.json')
+
+         with open(manifest_filepath, 'w') as fout:
+             line = json.dumps(manifest_data)
+             fout.write(line + '\n')
+
+         # call transcribe, passing in the manifest filepath
+         if duration < 40:
+             output_text = model.transcribe(manifest_filepath)[0]
+         else:  # do buffered inference
+             with torch.cuda.amp.autocast(dtype=amp_dtype):  # TODO: make it work if no cuda
+                 with torch.no_grad():
+                     hyps = get_buffered_pred_feat_multitaskAED(
+                         frame_asr,
+                         model.cfg.preprocessor,
+                         model_stride_in_secs,
+                         model.device,
+                         manifest=manifest_filepath,
+                         filepaths=None,
+                     )
+
+                     output_text = hyps[0].text
+
+     return output_text
+
+
+ iface = gr.Interface(
+     fn=transcribe,
+     inputs=gr.Audio(sources="microphone", type="filepath"),
+     outputs="text")
+
+ iface.queue()
  iface.launch()
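For anyone wanting to poke at the new code path outside the Space, here is a minimal smoke test, a sketch only and not part of the commit: the module name `app` and the file `samples/short_clip.wav` are hypothetical, and it assumes the NeMo, Gradio, librosa, soundfile, and torch dependencies above are installed. Clips under 40 seconds take the direct `model.transcribe()` branch; longer clips, up to `MAX_AUDIO_MINUTES`, go through the `FrameBatchMultiTaskAED` buffered branch, which as written still expects CUDA.

# hypothetical smoke test for the updated transcribe() path (sketch, not part of this commit)
# importing app runs the module top level, which downloads and loads nvidia/canary-1b
from app import transcribe, MAX_AUDIO_MINUTES

print(f"Demo limit: {MAX_AUDIO_MINUTES} minutes of audio")

# "samples/short_clip.wav" is a placeholder path; a clip under 40 s stays on the
# non-buffered branch, so this also works without the CUDA-only autocast code path
print(transcribe("samples/short_clip.wav"))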