Porjaz committed
Commit 14c8a1d · verified · 1 Parent(s): 29c051a

Upload 4 files

Files changed (4)
  1. 1000_unigram.model +3 -0
  2. README.md +4 -4
  3. app.py +396 -0
  4. requirements.txt +5 -0
1000_unigram.model ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:35a3a5130d52af7c3eb92cbf0c05bfed2f43c3204f3d17941a71cf8b46c84894
+ size 257888
README.md CHANGED
@@ -1,10 +1,10 @@
  ---
- title: Macedonian ASR Demo Wav2vec2
- emoji: 📚
- colorFrom: red
+ title: Macedonian ASR Demo
+ emoji: 👁
+ colorFrom: purple
  colorTo: yellow
  sdk: gradio
- sdk_version: 4.44.1
+ sdk_version: 4.41.0
  app_file: app.py
  pinned: false
  license: cc-by-4.0
app.py ADDED
@@ -0,0 +1,396 @@
+ import os
+ # os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
+
+ import gc
+ from functools import partial
+ import gradio as gr
+ import torch
+ from speechbrain.inference.interfaces import Pretrained, foreign_class
+ from transformers import T5Tokenizer, T5ForConditionalGeneration
+ import librosa
+ import whisper_timestamped as whisper
+ from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline, Wav2Vec2ForCTC
+
+
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+ torch.backends.cuda.matmul.allow_tf32 = True
+
+
+ def clean_up_memory():
+     gc.collect()
+     torch.cuda.empty_cache()
+
+
+ def recap_sentence(string):
+     # Restore capitalization and punctuation using the model
+     inputs = recap_tokenizer(["restore capitalization and punctuation: " + string], return_tensors="pt", padding=True).to(device)
+     outputs = recap_model.generate(**inputs, max_length=768, num_beams=5, early_stopping=True).squeeze(0)
+     recap_result = recap_tokenizer.decode(outputs, skip_special_tokens=True)
+     return recap_result
+
+
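+ # The prediction functions below share the same incremental scheme: each new
+ # partial transcript is recapitalized together with the previous segment for
+ # context, the words that were already emitted are stripped off, and the
+ # accumulated text is yielded back to the Gradio textbox.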
+ def return_prediction_w2v2(mic=None, file=None, device=device):
+     if mic is not None:
+         waveform, sr = librosa.load(mic, sr=16000)
+         waveform = waveform[:30*sr]  # keep at most the first 30 seconds
+         result_generator = w2v2_classifier.classify_file_w2v2(waveform, device)
+     elif file is not None:
+         waveform, sr = librosa.load(file, sr=16000)
+         waveform = waveform[:30*sr]
+         result_generator = w2v2_classifier.classify_file_w2v2(waveform, device)
+     else:
+         yield "You must either provide a mic recording or a file"
+         return
+
+     segment_results = ""
+     prev_segment = ""
+
+     # Loop through the partial results from classify_file
+     for i, partial_result in enumerate(result_generator):
+         # Convert the partial result to a readable string
+         partial_result = [" ".join(res) for res in partial_result]
+         partial_result = partial_result[0]
+
+         if prev_segment == "":
+             partial_result_hist = partial_result
+         else:
+             partial_result_hist = prev_segment + " " + partial_result
+
+         recap_result = recap_sentence(partial_result_hist)
+
+         if i == 0:
+             segment_results += recap_result
+         else:
+             recap_result = recap_result.split(" ")
+             prev_segment = prev_segment.split(" ")
+             recap_result = recap_result[len(prev_segment):]
+             segment_results += " " + " ".join(recap_result)
+
+         prev_segment = partial_result
+
+         # Capitalize any lowercase letter that follows sentence-final punctuation
+         for i, letter in enumerate(segment_results):
+             if i > 1 and segment_results[i-2] in [".", "!", "?"] and letter.islower():
+                 segment_results = segment_results[:i] + letter.upper() + segment_results[i+1:]
+
+         clean_up_memory()
+         yield segment_results
+
+
+
+ def return_prediction_whisper(mic=None, file=None, device=device):
+     if mic is not None:
+         waveform, sr = librosa.load(mic, sr=16000)
+         waveform = waveform[:30*sr]  # keep at most the first 30 seconds
+         result_generator = whisper_classifier.classify_file_whisper_mkd(waveform, device)
+     elif file is not None:
+         waveform, sr = librosa.load(file, sr=16000)
+         waveform = waveform[:30*sr]
+         result_generator = whisper_classifier.classify_file_whisper_mkd(waveform, device)
+     else:
+         yield "You must either provide a mic recording or a file"
+         return
+
+     segment_results = ""
+     prev_segment = ""
+
+     # Loop through the partial results from classify_file
+     for i, partial_result in enumerate(result_generator):
+         # Convert the partial result to a readable string
+         partial_result = ["".join(res) for res in partial_result]
+         partial_result = partial_result[0]
+
+         if prev_segment == "":
+             partial_result_hist = partial_result
+         else:
+             partial_result_hist = prev_segment + " " + partial_result
+
+         recap_result = recap_sentence(partial_result_hist)
+
+         if i == 0:
+             segment_results += recap_result
+         else:
+             recap_result = recap_result.split(" ")
+             prev_segment = prev_segment.split(" ")
+             recap_result = recap_result[len(prev_segment):]
+             segment_results += " " + " ".join(recap_result)
+
+         prev_segment = partial_result
+
+         # Capitalize any lowercase letter that follows sentence-final punctuation
+         for i, letter in enumerate(segment_results):
+             if i > 1 and segment_results[i-2] in [".", "!", "?"] and letter.islower():
+                 segment_results = segment_results[:i] + letter.upper() + segment_results[i+1:]
+
+         clean_up_memory()
+         yield segment_results
+
+
+ def return_prediction_compare(mic=None, file=None, device=device):
+     # pipe_whisper.model.to(device)
+     # mms_model.to(device)
+     if mic is not None:
+         waveform, sr = librosa.load(mic, sr=16000)
+         waveform = waveform[:30*sr]  # keep at most the first 30 seconds
+         result_generator_whisper = whisper_classifier.classify_file_whisper_mkd(waveform, device)
+         # result_generator_w2v2 = w2v2_classifier.classify_file_w2v2(mic, device)
+         whisper_result = whisper_classifier.classify_file_whisper(waveform, pipe_whisper, device)
+         mms_result_generator = whisper_classifier.classify_file_mms(waveform, processor_mms, mms_model, device)
+
+     elif file is not None:
+         waveform, sr = librosa.load(file, sr=16000)
+         waveform = waveform[:30*sr]
+         result_generator_whisper = whisper_classifier.classify_file_whisper_mkd(waveform, device)
+         # result_generator_w2v2 = w2v2_classifier.classify_file_w2v2(file, device)
+         whisper_result = whisper_classifier.classify_file_whisper(waveform, pipe_whisper, device)
+         mms_result_generator = whisper_classifier.classify_file_mms(waveform, processor_mms, mms_model, device)
+     else:
+         yield "You must either provide a mic recording or a file"
+         return
+     # pipe_whisper.model.to("cpu")
+     # mms_model.to("cpu")
+
+     segment_results_whisper = ""
+     prev_segment_whisper = ""
+     # segment_results_w2v2 = ""
+     # prev_segment_w2v2 = ""
+     segment_results_mms = ""
+     prev_segment_mms = ""
+
+     # Loop through the partial results from classify_file
+     # for i, (partial_result_whisper, partial_result_w2v2, partial_result_mms) in enumerate(zip(result_generator_whisper, result_generator_w2v2, mms_result_generator)):
+     for i, (partial_result_whisper, partial_result_mms) in enumerate(zip(result_generator_whisper, mms_result_generator)):
+         # Convert the partial result to a readable string
+         partial_result_whisper = ["".join(res) for res in partial_result_whisper]
+         partial_result_whisper = partial_result_whisper[0]
+
+         # partial_result_w2v2 = [" ".join(res) for res in partial_result_w2v2]
+         # partial_result_w2v2 = partial_result_w2v2[0]
+
+         if prev_segment_whisper == "":
+             partial_result_hist_whisper = partial_result_whisper
+         else:
+             partial_result_hist_whisper = prev_segment_whisper + " " + partial_result_whisper
+
+         # if prev_segment_w2v2 == "":
+         #     partial_result_hist_w2v2 = partial_result_w2v2
+         # else:
+         #     partial_result_hist_w2v2 = prev_segment_w2v2 + " " + partial_result_w2v2
+
+         if prev_segment_mms == "":
+             partial_result_hist_mms = partial_result_mms
+         else:
+             partial_result_hist_mms = prev_segment_mms + " " + partial_result_mms
+         # Restore capitalization and punctuation using the model
+         recap_result_whisper = recap_sentence(partial_result_hist_whisper)
+         # recap_result_w2v2 = recap_sentence(partial_result_hist_w2v2)
+         recap_result_mms = recap_sentence(partial_result_hist_mms)
+         if i == 0:
+             segment_results_whisper += recap_result_whisper
+             # segment_results_w2v2 += recap_result_w2v2
+             segment_results_mms += recap_result_mms
+         else:
+             recap_result_whisper = recap_result_whisper.split(" ")
+             prev_segment_whisper = prev_segment_whisper.split(" ")
+             recap_result_whisper = recap_result_whisper[len(prev_segment_whisper):]
+             segment_results_whisper += " " + " ".join(recap_result_whisper)
+
+             # recap_result_w2v2 = recap_result_w2v2.split(" ")
+             # prev_segment_w2v2 = prev_segment_w2v2.split(" ")
+             # recap_result_w2v2 = recap_result_w2v2[len(prev_segment_w2v2):]
+             # segment_results_w2v2 += " " + " ".join(recap_result_w2v2)
+
+             recap_result_mms = recap_result_mms.split(" ")
+             prev_segment_mms = prev_segment_mms.split(" ")
+             recap_result_mms = recap_result_mms[len(prev_segment_mms):]
+             segment_results_mms += " " + " ".join(recap_result_mms)
+
+         prev_segment_whisper = partial_result_hist_whisper
+         # prev_segment_w2v2 = partial_result_hist_w2v2
+         prev_segment_mms = partial_result_mms
+
+         # Capitalize any lowercase letter that follows sentence-final punctuation
+
+         # Whisper
+         for i, letter in enumerate(segment_results_whisper):
+             if i > 1 and segment_results_whisper[i-2] in [".", "!", "?"] and letter.islower():
+                 segment_results_whisper = segment_results_whisper[:i] + letter.upper() + segment_results_whisper[i+1:]
+
+         # W2V2
+         # for i, letter in enumerate(segment_results_w2v2):
+         #     if i > 1 and segment_results_w2v2[i-2] in [".", "!", "?"] and letter.islower():
+         #         segment_results_w2v2 = segment_results_w2v2[:i] + letter.upper() + segment_results_w2v2[i+1:]
+
+         # MMS
+         for i, letter in enumerate(segment_results_mms):
+             if i > 1 and segment_results_mms[i-2] in [".", "!", "?"] and letter.islower():
+                 segment_results_mms = segment_results_mms[:i] + letter.upper() + segment_results_mms[i+1:]
+
+         clean_up_memory()
+         yield "Буки-Whisper:\n" + segment_results_whisper + "\n\n" + "MMS:\n" + segment_results_mms + "\n\n" + "OpenAI Whisper:\n" + whisper_result
+         # yield "Our W2v2: \n" + segment_results_w2v2 + "\n\n" + "MMS transcript:\n" + segment_results_mms
+
+
+
+ # Load Whisper model
+ model_id = "openai/whisper-large-v3"
+ whisper_model = AutoModelForSpeechSeq2Seq.from_pretrained(model_id, torch_dtype=torch.float16, low_cpu_mem_usage=True, use_safetensors=True, attn_implementation="sdpa")
+ whisper_model.to(device)
+ # whisper_model = torch.compile(whisper_model, backend="inductor")
+ # whisper_model.generation_config.cache_implementation = "static"
+ # whisper_model.forward = torch.compile(whisper_model.forward, mode="reduce-overhead", fullgraph=True)
+ processor = AutoProcessor.from_pretrained(model_id)
+ pipe_whisper = pipeline(
+     "automatic-speech-recognition",
+     model=whisper_model,
+     tokenizer=processor.tokenizer,
+     feature_extractor=processor.feature_extractor,
+     torch_dtype=torch.float16,
+     return_timestamps=True,
+     device=device,
+ )
+
+
+ # Load MMS model
+ model_id = "facebook/mms-1b-all"
+ processor_mms = AutoProcessor.from_pretrained(model_id)
+ mms_model = Wav2Vec2ForCTC.from_pretrained(model_id)
+ mms_model = mms_model.to(device)
+ mms_model.eval()
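+ # Switch MMS to its Macedonian adapter: the tokenizer target language and the
+ # loaded adapter weights must both use the same ISO code ("mkd")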
+ processor_mms.tokenizer.set_target_lang("mkd")
+ mms_model.load_adapter("mkd")
+
+
+
+ # Create a partial function with the device pre-applied
+ return_prediction_whisper_with_device = partial(return_prediction_whisper, device=device)
+ return_prediction_w2v2_with_device = partial(return_prediction_w2v2, device=device)
+ return_prediction_with_device_compare = partial(return_prediction_compare, device=device)
+
+
+ # Load the ASR models
+ w2v2_classifier = foreign_class(source="Macedonian-ASR/wav2vec2-aed-macedonian-asr", pymodule_file="custom_interface_app.py", classname="ASR")
+ w2v2_classifier = w2v2_classifier.to(device)
+ w2v2_classifier.eval()
+ whisper_classifier = foreign_class(source="Macedonian-ASR/whisper-large-v3-macedonian-asr", pymodule_file="custom_interface_app.py", classname="ASR")
+ whisper_classifier = whisper_classifier.to(device)
+ whisper_classifier.eval()
+
+
+ # Load the T5 tokenizer and model for restoring capitalization
+ recap_model_name = "Macedonian-ASR/mt5-restore-capitalization-macedonian"
+ recap_tokenizer = T5Tokenizer.from_pretrained(recap_model_name)
+ recap_model = T5ForConditionalGeneration.from_pretrained(recap_model_name, torch_dtype=torch.float16)
+ recap_model.to(device)
+ recap_model.eval()
+
+
+ mic_transcribe_whisper = gr.Interface(
+     fn=return_prediction_whisper_with_device,
+     inputs=gr.Audio(sources="microphone", type="filepath"),
+     outputs=gr.Textbox(),
+     allow_flagging="never",
+     live=True,
+ )
+
+ # file_transcribe_whisper = gr.Interface(
+ #     fn=return_prediction_whisper_with_device,
+ #     inputs=gr.Audio(sources="upload", type="filepath"),
+ #     outputs=gr.Textbox(),
+ #     allow_flagging="never",
+ #     live=True
+ # )
+
+ mic_transcribe_w2v2 = gr.Interface(
+     fn=return_prediction_w2v2_with_device,
+     inputs=gr.Audio(sources="microphone", type="filepath"),
+     outputs=gr.Textbox(),
+     allow_flagging="never",
+     live=True,
+ )
+
+
+ # file_transcribe_w2v2 = gr.Interface(
+ #     fn=return_prediction_w2v2_with_device,
+ #     inputs=gr.Audio(sources="upload", type="filepath"),
+ #     outputs=gr.Textbox(),
+ #     allow_flagging="never",
+ #     live=True
+ # )
+
+ mic_transcribe_compare = gr.Interface(
+     fn=return_prediction_with_device_compare,
+     inputs=gr.Audio(sources="microphone", type="filepath"),
+     outputs=gr.Textbox(),
+     allow_flagging="never",
+     live=True,
+ )
+
+ # file_transcribe_compare = gr.Interface(
+ #     fn=return_prediction_with_device_compare,
+ #     inputs=gr.Audio(sources="upload", type="filepath"),
+ #     outputs=gr.Textbox(),
+ #     allow_flagging="never",
+ #     live=True
+ # )
+
+
+ project_description = '''
+ ## Автори:
+ 1. **Дејан Порјазовски**
+ 2. **Илина Јакимовска**
+ 3. **Ордан Чукалиев**
+ 4. **Никола Стиков**
+
+ Оваа колаборација е дел од активностите на **Центарот за напредни интердисциплинарни истражувања ([ЦеНИИс](https://ukim.edu.mk/en/centri/centar-za-napredni-interdisciplinarni-istrazhuvanja-ceniis))** при УКИМ.
+
+ ## Во тренирањето на овој модел се употребени податоци од:
+ 1. Дигитален архив за етнолошки и антрополошки ресурси (ДАЕАР) при Институтот за етнологија и антропологија, Природно-математички факултет при УКИМ.
+ 2. Аудио верзија на меѓународното списание „ЕтноАнтропоЗум“ на Институтот за етнологија и антропологија, Природно-математички факултет при УКИМ.
+ 3. Аудио подкастот „Обични луѓе“ на Илина Јакимовска
+ 4. Научните видеа од серијалот „Наука за деца“, фондација КАНТАРОТ
+ 5. Македонска верзија на Mozilla Common Voice (верзија 18.0)
+ '''
+
+ # Custom CSS
+ css = """
+ .gradio-container {
+     background-color: #f0f0f0; /* Set your desired background color */
+ }
+
+ .custom-markdown p, .custom-markdown li, .custom-markdown h2, .custom-markdown a {
+     font-size: 15px !important;
+     font-family: Arial, sans-serif !important;
+ }
+
+ .gradio-container {
+     background-color: #f3f3f3 !important;
+ }
+ """
+
+ transcriber_app = gr.Blocks(css=css, delete_cache=(60, 120))
+
+ with transcriber_app:
+     state = gr.State()
+     gr.Markdown(project_description, elem_classes="custom-markdown")
+
+     # gr.TabbedInterface(
+     #     [mic_transcribe_whisper, mic_transcribe_compare],
+     #     ["Буки-Whisper транскрипција", "Споредба на модели"],
+     # )
+     # state = gr.State(value=[], delete_callback=lambda v: print("STATE DELETED"))
+
+     gr.TabbedInterface(
+         [mic_transcribe_whisper, mic_transcribe_w2v2, mic_transcribe_compare],
+         ["Буки-Whisper транскрипција", "Буки-W2v2 транскрипција", "Споредба на модели"],
+     )
+     state = gr.State(value=[], delete_callback=lambda v: print("STATE DELETED"))
+
+     transcriber_app.unload(return_prediction_whisper)
+     # transcriber_app.unload(return_prediction_compare)
+
+
+ # transcriber_app.launch(debug=True, share=True, ssl_verify=False)
+ if __name__ == "__main__":
+     transcriber_app.queue()
+     transcriber_app.launch()
requirements.txt ADDED
@@ -0,0 +1,5 @@
+ speechbrain
+ transformers
+ librosa
+ whisper_timestamped
+ accelerate