Porjaz committed
Commit 14c8a1d · verified · 1 Parent(s): 29c051a

Upload 4 files

Files changed (4)
  1. 1000_unigram.model +3 -0
  2. README.md +4 -4
  3. app.py +396 -0
  4. requirements.txt +5 -0
1000_unigram.model ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:35a3a5130d52af7c3eb92cbf0c05bfed2f43c3204f3d17941a71cf8b46c84894
+ size 257888
README.md CHANGED
@@ -1,10 +1,10 @@
  ---
- title: Macedonian ASR Demo Wav2vec2
- emoji: 📚
- colorFrom: red
+ title: Macedonian ASR Demo
+ emoji: 👁
+ colorFrom: purple
  colorTo: yellow
  sdk: gradio
- sdk_version: 4.44.1
+ sdk_version: 4.41.0
  app_file: app.py
  pinned: false
  license: cc-by-4.0
app.py ADDED
@@ -0,0 +1,396 @@
+ import os
+ # os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
+
+ import gc
+ from functools import partial
+ import gradio as gr
+ import torch
+ from speechbrain.inference.interfaces import Pretrained, foreign_class
+ from transformers import T5Tokenizer, T5ForConditionalGeneration
+ import librosa
+ import whisper_timestamped as whisper
+ from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline, Wav2Vec2ForCTC
+
+
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+ torch.backends.cuda.matmul.allow_tf32 = True
+
+
+ def clean_up_memory():
+     gc.collect()
+     torch.cuda.empty_cache()
+
+
+ def recap_sentence(string):
+     # Restore capitalization and punctuation using the model
+     inputs = recap_tokenizer(["restore capitalization and punctuation: " + string], return_tensors="pt", padding=True).to(device)
+     outputs = recap_model.generate(**inputs, max_length=768, num_beams=5, early_stopping=True).squeeze(0)
+     recap_result = recap_tokenizer.decode(outputs, skip_special_tokens=True)
+     return recap_result
+
+
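+ # The prediction functions below share the same incremental scheme: each new
+ # partial transcript is recapitalized together with the previous segment for
+ # context, the words that were already emitted are stripped off, and the
+ # accumulated text is yielded back to the Gradio textbox.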
+ def return_prediction_w2v2(mic=None, file=None, device=device):
+     if mic is not None:
+         waveform, sr = librosa.load(mic, sr=16000)
+         waveform = waveform[:30*sr]  # keep at most the first 30 seconds
+         result_generator = w2v2_classifier.classify_file_w2v2(waveform, device)
+     elif file is not None:
+         waveform, sr = librosa.load(file, sr=16000)
+         waveform = waveform[:30*sr]
+         result_generator = w2v2_classifier.classify_file_w2v2(waveform, device)
+     else:
+         yield "You must either provide a mic recording or a file"
+         return
+
+     segment_results = ""
+     prev_segment = ""
+
+     # Loop through the partial results from classify_file
+     for i, partial_result in enumerate(result_generator):
+         # Convert the partial result to a readable string
+         partial_result = [" ".join(res) for res in partial_result]
+         partial_result = partial_result[0]
+
+         if prev_segment == "":
+             partial_result_hist = partial_result
+         else:
+             partial_result_hist = prev_segment + " " + partial_result
+
+         recap_result = recap_sentence(partial_result_hist)
+
+         if i == 0:
+             segment_results += recap_result
+         else:
+             recap_result = recap_result.split(" ")
+             prev_segment = prev_segment.split(" ")
+             recap_result = recap_result[len(prev_segment):]
+             segment_results += " " + " ".join(recap_result)
+
+         prev_segment = partial_result
+
+         # Capitalize any lowercase letter that follows sentence-final punctuation
+         for i, letter in enumerate(segment_results):
+             if i > 1 and segment_results[i-2] in [".", "!", "?"] and letter.islower():
+                 segment_results = segment_results[:i] + letter.upper() + segment_results[i+1:]
+
+         clean_up_memory()
+         yield segment_results
+
+
+
+ def return_prediction_whisper(mic=None, file=None, device=device):
+     if mic is not None:
+         waveform, sr = librosa.load(mic, sr=16000)
+         waveform = waveform[:30*sr]  # keep at most the first 30 seconds
+         result_generator = whisper_classifier.classify_file_whisper_mkd(waveform, device)
+     elif file is not None:
+         waveform, sr = librosa.load(file, sr=16000)
+         waveform = waveform[:30*sr]
+         result_generator = whisper_classifier.classify_file_whisper_mkd(waveform, device)
+     else:
+         yield "You must either provide a mic recording or a file"
+         return
+
+     segment_results = ""
+     prev_segment = ""
+
+     # Loop through the partial results from classify_file
+     for i, partial_result in enumerate(result_generator):
+         # Convert the partial result to a readable string
+         partial_result = ["".join(res) for res in partial_result]
+         partial_result = partial_result[0]
+
+         if prev_segment == "":
+             partial_result_hist = partial_result
+         else:
+             partial_result_hist = prev_segment + " " + partial_result
+
+         recap_result = recap_sentence(partial_result_hist)
+
+         if i == 0:
+             segment_results += recap_result
+         else:
+             recap_result = recap_result.split(" ")
+             prev_segment = prev_segment.split(" ")
+             recap_result = recap_result[len(prev_segment):]
+             segment_results += " " + " ".join(recap_result)
+
+         prev_segment = partial_result
+
+         # Capitalize any lowercase letter that follows sentence-final punctuation
+         for i, letter in enumerate(segment_results):
+             if i > 1 and segment_results[i-2] in [".", "!", "?"] and letter.islower():
+                 segment_results = segment_results[:i] + letter.upper() + segment_results[i+1:]
+
+         clean_up_memory()
+         yield segment_results
+
+
+ def return_prediction_compare(mic=None, file=None, device=device):
+     # pipe_whisper.model.to(device)
+     # mms_model.to(device)
+     if mic is not None:
+         waveform, sr = librosa.load(mic, sr=16000)
+         waveform = waveform[:30*sr]  # keep at most the first 30 seconds
+         result_generator_whisper = whisper_classifier.classify_file_whisper_mkd(waveform, device)
+         # result_generator_w2v2 = w2v2_classifier.classify_file_w2v2(mic, device)
+         whisper_result = whisper_classifier.classify_file_whisper(waveform, pipe_whisper, device)
+         mms_result_generator = whisper_classifier.classify_file_mms(waveform, processor_mms, mms_model, device)
+
+     elif file is not None:
+         waveform, sr = librosa.load(file, sr=16000)
+         waveform = waveform[:30*sr]
+         result_generator_whisper = whisper_classifier.classify_file_whisper_mkd(waveform, device)
+         # result_generator_w2v2 = w2v2_classifier.classify_file_w2v2(file, device)
+         whisper_result = whisper_classifier.classify_file_whisper(waveform, pipe_whisper, device)
+         mms_result_generator = whisper_classifier.classify_file_mms(waveform, processor_mms, mms_model, device)
+     else:
+         yield "You must either provide a mic recording or a file"
+         return
+     # pipe_whisper.model.to("cpu")
+     # mms_model.to("cpu")
+
+     segment_results_whisper = ""
+     prev_segment_whisper = ""
+     # segment_results_w2v2 = ""
+     # prev_segment_w2v2 = ""
+     segment_results_mms = ""
+     prev_segment_mms = ""
+
+     # Loop through the partial results from classify_file
+     # for i, (partial_result_whisper, partial_result_w2v2, partial_result_mms) in enumerate(zip(result_generator_whisper, result_generator_w2v2, mms_result_generator)):
+     for i, (partial_result_whisper, partial_result_mms) in enumerate(zip(result_generator_whisper, mms_result_generator)):
+         # Convert the partial result to a readable string
+         partial_result_whisper = ["".join(res) for res in partial_result_whisper]
+         partial_result_whisper = partial_result_whisper[0]
+
+         # partial_result_w2v2 = [" ".join(res) for res in partial_result_w2v2]
+         # partial_result_w2v2 = partial_result_w2v2[0]
+
+         if prev_segment_whisper == "":
+             partial_result_hist_whisper = partial_result_whisper
+         else:
+             partial_result_hist_whisper = prev_segment_whisper + " " + partial_result_whisper
+
+         # if prev_segment_w2v2 == "":
+         #     partial_result_hist_w2v2 = partial_result_w2v2
+         # else:
+         #     partial_result_hist_w2v2 = prev_segment_w2v2 + " " + partial_result_w2v2
+
+         if prev_segment_mms == "":
+             partial_result_hist_mms = partial_result_mms
+         else:
+             partial_result_hist_mms = prev_segment_mms + " " + partial_result_mms
+         # Restore capitalization and punctuation using the model
+         recap_result_whisper = recap_sentence(partial_result_hist_whisper)
+         # recap_result_w2v2 = recap_sentence(partial_result_hist_w2v2)
+         recap_result_mms = recap_sentence(partial_result_hist_mms)
+         if i == 0:
+             segment_results_whisper += recap_result_whisper
+             # segment_results_w2v2 += recap_result_w2v2
+             segment_results_mms += recap_result_mms
+         else:
+             recap_result_whisper = recap_result_whisper.split(" ")
+             prev_segment_whisper = prev_segment_whisper.split(" ")
+             recap_result_whisper = recap_result_whisper[len(prev_segment_whisper):]
+             segment_results_whisper += " " + " ".join(recap_result_whisper)
+
+             # recap_result_w2v2 = recap_result_w2v2.split(" ")
+             # prev_segment_w2v2 = prev_segment_w2v2.split(" ")
+             # recap_result_w2v2 = recap_result_w2v2[len(prev_segment_w2v2):]
+             # segment_results_w2v2 += " " + " ".join(recap_result_w2v2)
+
+             recap_result_mms = recap_result_mms.split(" ")
+             prev_segment_mms = prev_segment_mms.split(" ")
+             recap_result_mms = recap_result_mms[len(prev_segment_mms):]
+             segment_results_mms += " " + " ".join(recap_result_mms)
+
+         prev_segment_whisper = partial_result_hist_whisper
+         # prev_segment_w2v2 = partial_result_hist_w2v2
+         prev_segment_mms = partial_result_mms
+
+         # Capitalize any lowercase letter that follows sentence-final punctuation
+
+         # Whisper
+         for i, letter in enumerate(segment_results_whisper):
+             if i > 1 and segment_results_whisper[i-2] in [".", "!", "?"] and letter.islower():
+                 segment_results_whisper = segment_results_whisper[:i] + letter.upper() + segment_results_whisper[i+1:]
+
+         # W2V2
+         # for i, letter in enumerate(segment_results_w2v2):
+         #     if i > 1 and segment_results_w2v2[i-2] in [".", "!", "?"] and letter.islower():
+         #         segment_results_w2v2 = segment_results_w2v2[:i] + letter.upper() + segment_results_w2v2[i+1:]
+
+         # MMS
+         for i, letter in enumerate(segment_results_mms):
+             if i > 1 and segment_results_mms[i-2] in [".", "!", "?"] and letter.islower():
+                 segment_results_mms = segment_results_mms[:i] + letter.upper() + segment_results_mms[i+1:]
+
+         clean_up_memory()
+         yield "Буки-Whisper:\n" + segment_results_whisper + "\n\n" + "MMS:\n" + segment_results_mms + "\n\n" + "OpenAI Whisper:\n" + whisper_result
+         # yield "Our W2v2: \n" + segment_results_w2v2 + "\n\n" + "MMS transcript:\n" + segment_results_mms
+
+
+
+ # Load Whisper model
+ model_id = "openai/whisper-large-v3"
+ whisper_model = AutoModelForSpeechSeq2Seq.from_pretrained(model_id, torch_dtype=torch.float16, low_cpu_mem_usage=True, use_safetensors=True, attn_implementation="sdpa")
+ whisper_model.to(device)
+ # whisper_model = torch.compile(whisper_model, backend="inductor")
+ # whisper_model.generation_config.cache_implementation = "static"
+ # whisper_model.forward = torch.compile(whisper_model.forward, mode="reduce-overhead", fullgraph=True)
+ processor = AutoProcessor.from_pretrained(model_id)
+ pipe_whisper = pipeline(
+     "automatic-speech-recognition",
+     model=whisper_model,
+     tokenizer=processor.tokenizer,
+     feature_extractor=processor.feature_extractor,
+     torch_dtype=torch.float16,
+     return_timestamps=True,
+     device=device,
+ )
+
+
+ # Load MMS model
+ model_id = "facebook/mms-1b-all"
+ processor_mms = AutoProcessor.from_pretrained(model_id)
+ mms_model = Wav2Vec2ForCTC.from_pretrained(model_id)
+ mms_model = mms_model.to(device)
+ mms_model.eval()
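+ # Switch MMS to its Macedonian adapter: the tokenizer target language and the
+ # loaded adapter weights must both use the same ISO code ("mkd")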
+ processor_mms.tokenizer.set_target_lang("mkd")
+ mms_model.load_adapter("mkd")
+
+
+
+ # Create a partial function with the device pre-applied
+ return_prediction_whisper_with_device = partial(return_prediction_whisper, device=device)
+ return_prediction_w2v2_with_device = partial(return_prediction_w2v2, device=device)
+ return_prediction_with_device_compare = partial(return_prediction_compare, device=device)
+
+
+ # Load the ASR models
+ w2v2_classifier = foreign_class(source="Macedonian-ASR/wav2vec2-aed-macedonian-asr", pymodule_file="custom_interface_app.py", classname="ASR")
+ w2v2_classifier = w2v2_classifier.to(device)
+ w2v2_classifier.eval()
+ whisper_classifier = foreign_class(source="Macedonian-ASR/whisper-large-v3-macedonian-asr", pymodule_file="custom_interface_app.py", classname="ASR")
+ whisper_classifier = whisper_classifier.to(device)
+ whisper_classifier.eval()
+
+
+ # Load the T5 tokenizer and model for restoring capitalization
+ recap_model_name = "Macedonian-ASR/mt5-restore-capitalization-macedonian"
+ recap_tokenizer = T5Tokenizer.from_pretrained(recap_model_name)
+ recap_model = T5ForConditionalGeneration.from_pretrained(recap_model_name, torch_dtype=torch.float16)
+ recap_model.to(device)
+ recap_model.eval()
+
+
+ mic_transcribe_whisper = gr.Interface(
+     fn=return_prediction_whisper_with_device,
+     inputs=gr.Audio(sources="microphone", type="filepath"),
+     outputs=gr.Textbox(),
+     allow_flagging="never",
+     live=True,
+ )
+
+ # file_transcribe_whisper = gr.Interface(
+ #     fn=return_prediction_whisper_with_device,
+ #     inputs=gr.Audio(sources="upload", type="filepath"),
+ #     outputs=gr.Textbox(),
+ #     allow_flagging="never",
+ #     live=True
+ # )
+
+ mic_transcribe_w2v2 = gr.Interface(
+     fn=return_prediction_w2v2_with_device,
+     inputs=gr.Audio(sources="microphone", type="filepath"),
+     outputs=gr.Textbox(),
+     allow_flagging="never",
+     live=True,
+ )
+
+
+ # file_transcribe_w2v2 = gr.Interface(
+ #     fn=return_prediction_w2v2_with_device,
+ #     inputs=gr.Audio(sources="upload", type="filepath"),
+ #     outputs=gr.Textbox(),
+ #     allow_flagging="never",
+ #     live=True
+ # )
+
+ mic_transcribe_compare = gr.Interface(
+     fn=return_prediction_with_device_compare,
+     inputs=gr.Audio(sources="microphone", type="filepath"),
+     outputs=gr.Textbox(),
+     allow_flagging="never",
+     live=True,
+ )
+
+ # file_transcribe_compare = gr.Interface(
+ #     fn=return_prediction_with_device_compare,
+ #     inputs=gr.Audio(sources="upload", type="filepath"),
+ #     outputs=gr.Textbox(),
+ #     allow_flagging="never",
+ #     live=True
+ # )
+
+
+ project_description = '''
+ ## Автори:
+ 1. **Дејан Порјазовски**
+ 2. **Илина Јакимовска**
+ 3. **Ордан Чукалиев**
+ 4. **Никола Стиков**
+
+ Оваа колаборација е дел од активностите на **Центарот за напредни интердисциплинарни истражувања ([ЦеНИИс](https://ukim.edu.mk/en/centri/centar-za-napredni-interdisciplinarni-istrazhuvanja-ceniis))** при УКИМ.
+
+ ## Во тренирањето на овој модел се употребени податоци од:
+ 1. Дигитален архив за етнолошки и антрополошки ресурси (ДАЕАР) при Институтот за етнологија и антропологија, Природно-математички факултет при УКИМ.
+ 2. Аудио верзија на меѓународното списание „ЕтноАнтропоЗум“ на Институтот за етнологија и антропологија, Природно-математички факултет при УКИМ.
+ 3. Аудио подкастот „Обични луѓе“ на Илина Јакимовска
+ 4. Научните видеа од серијалот „Наука за деца“, фондација КАНТАРОТ
+ 5. Македонска верзија на Mozilla Common Voice (верзија 18.0)
+ '''
+
+ # Custom CSS
+ css = """
+ .gradio-container {
+     background-color: #f0f0f0; /* Set your desired background color */
+ }
+
+ .custom-markdown p, .custom-markdown li, .custom-markdown h2, .custom-markdown a {
+     font-size: 15px !important;
+     font-family: Arial, sans-serif !important;
+ }
+
+ .gradio-container {
+     background-color: #f3f3f3 !important;
+ }
+ """
+
+ transcriber_app = gr.Blocks(css=css, delete_cache=(60, 120))
+
+ with transcriber_app:
+     state = gr.State()
+     gr.Markdown(project_description, elem_classes="custom-markdown")
+
+     # gr.TabbedInterface(
+     #     [mic_transcribe_whisper, mic_transcribe_compare],
+     #     ["Буки-Whisper транскрипција", "Споредба на модели"],
+     # )
+     # state = gr.State(value=[], delete_callback=lambda v: print("STATE DELETED"))
+
+     gr.TabbedInterface(
+         [mic_transcribe_whisper, mic_transcribe_w2v2, mic_transcribe_compare],
+         ["Буки-Whisper транскрипција", "Буки-W2v2 транскрипција", "Споредба на модели"],
+     )
+     state = gr.State(value=[], delete_callback=lambda v: print("STATE DELETED"))
+
+     transcriber_app.unload(return_prediction_whisper)
+     # transcriber_app.unload(return_prediction_compare)
+
+
+ # transcriber_app.launch(debug=True, share=True, ssl_verify=False)
+ if __name__ == "__main__":
+     transcriber_app.queue()
+     transcriber_app.launch()
requirements.txt ADDED
@@ -0,0 +1,5 @@
+ speechbrain
+ transformers
+ librosa
+ whisper_timestamped
+ accelerate