j-tobias committed · Commit 09b2769 · Parent(s): e3bf44e

updated backend

Files changed:
- .codetogether.ignore +0 -1
- __pycache__/dataset.cpython-310.pyc +0 -0
- __pycache__/model.cpython-310.pyc +0 -0
- __pycache__/processing.cpython-310.pyc +0 -0
- __pycache__/utils.cpython-310.pyc +0 -0
- app.py +42 -63
- dataset.py +0 -93
- model.py +0 -122
- processing.py +194 -0
- utils.py +11 -0
.codetogether.ignore DELETED
@@ -1 +0,0 @@
-credentials.json
__pycache__/dataset.cpython-310.pyc ADDED
Binary file (3.34 kB)

__pycache__/model.cpython-310.pyc ADDED
Binary file (3.74 kB)

__pycache__/processing.cpython-310.pyc ADDED
Binary file (4.24 kB)

__pycache__/utils.cpython-310.pyc ADDED
Binary file (1.04 kB)
app.py CHANGED
@@ -1,9 +1,5 @@
 import gradio as gr
-from dataset import Dataset
-from model import Model
-from utils import compute_wer
-import plotly.graph_objs as go
-
+from processing import run
 
 # from utils import hf_login
 # hf_login()
@@ -14,73 +10,58 @@ import os
 hf_token = os.getenv("HF_Token")
 login(hf_token)
 
-dataset = Dataset()
-models = Model()
+MODEL_OPTIONS = ["openai/whisper-tiny.en", "facebook/s2t-medium-librispeech-asr"]
+DATASET_OPTIONS = ["Common Voice", "VoxPopuli", "OWN Recoding/Sample"]
 
-def run_tests(dataset_choice:str, model:str):
-
-    MoDeL = Model()
-    MoDeL.select(model)
-    MoDeL.load()
-    DaTaSeT = Dataset(100)
-    DaTaSeT.load(dataset_choice)
-    references, predictions = MoDeL.process(DaTaSeT)
-    wer = compute_wer(references=references, predictions=predictions)
-    return wer
-
-def eval(data_subset:str, model_1:str, model_2:str)->str:
-
-    wer_result_1 = run_tests(data_subset, model_1)
-    wer_result_2 = run_tests(data_subset, model_2)
-
-    results_md = f"""#### {model_1}
-- WER Score: {wer_result_1}
-
-#### {model_2}
-- WER Score: {wer_result_2}"""
-
-    # Create the bar plot
-    fig = go.Figure(
-        data=[
-            go.Bar(x=[f"{model_1}"], y=[wer_result_1]),
-            go.Bar(x=[f"{model_2}"], y=[wer_result_2]),
-        ]
-    )
-
-    # Update the layout for better visualization
-    fig.update_layout(
-        title="Comparison of Two Models",
-        xaxis_title="Models",
-        yaxis_title="Value",
-        barmode="group",
-    )
-
-    return results_md, fig
+# def eval(data_subset:str, model_1:str, model_2:str, own_audio, own_transcription:str)->str:
+
+#     print("OWN AUDIO: ", type(own_audio), own_audio)
+
+#     wer_result_1, wer_result_2, references, transcriptions1, transcriptions2 = run(data_subset, model_1, model_2, own_audio, own_transcription)
+
+#     results_md = f"""#### {model_1}
+#     - WER Score: {wer_result_1}
+
+#     #### {model_2}
+#     - WER Score: {wer_result_2}"""
+
+#     # Create the bar plot
+#     fig = go.Figure(
+#         data=[
+#             go.Bar(x=[f"{model_1}"], y=[wer_result_1]),
+#             go.Bar(x=[f"{model_2}"], y=[wer_result_2]),
+#         ]
+#     )
+
+#     # Update the layout for better visualization
+#     fig.update_layout(
+#         title="Comparison of Two Models",
+#         xaxis_title="Models",
+#         yaxis_title="Value",
+#         barmode="group",
+#     )
+
+#     return results_md, fig
 
 def get_card(selected_model:str)->str:
 
-    print("Selected Model for Card: ", selected_model)
     with open("cards.txt", "r") as f:
         cards = f.read()
 
-    print(cards)
-
     cards = cards.split("@@")
     for card in cards:
-        print("CARD: ", card)
         if "ID: "+selected_model in card:
             return card
 
     return "Unknown Model"
 
-def is_own(
-    if
-
-    return
-    own_audio = None
-    own_transcription = None
-    return own_audio, own_transcription
+def is_own(selected_option):
+    if selected_option == "OWN Recoding/Sample":
+        return gr.update(visible=True), gr.update(visible=True)
+    else:
+        return gr.update(visible=False), gr.update(visible=False)
 
 with gr.Blocks() as demo:
@@ -106,31 +87,29 @@ Happy experimenting and comparing! 🚀""")
         pass
     with gr.Column(scale=5):
         data_subset = gr.Radio(
-            value="
-            choices=
+            value="Common Voice",
+            choices=DATASET_OPTIONS,
             label="Data subset / Own Sample",
         )
+        own_audio = gr.Audio(visible=False)
+        own_transcription = gr.TextArea(lines=2, visible=False)
+        data_subset.change(is_own, inputs=[data_subset], outputs=[own_audio, own_transcription])
     with gr.Column(scale=1):
         pass
 
-    with gr.Row():
-        own_audio = gr.Audio(sources=['microphone'],streaming=False,visible=False)
-        own_transcription = gr.TextArea(lines=2, visible=False)
-        data_subset.change(is_own, inputs=[data_subset], outputs=[own_audio, own_transcription])
-
     with gr.Row():
 
         with gr.Column(scale=1):
             model_1 = gr.Dropdown(
-                choices=
+                choices=MODEL_OPTIONS,
                 label="Select Model"
            )
            model_1_card = gr.Markdown("")
 
        with gr.Column(scale=1):
            model_2 = gr.Dropdown(
-                choices=
+                choices=MODEL_OPTIONS,
                label="Select Model"
            )
            model_2_card = gr.Markdown("")
@@ -148,6 +127,6 @@ Happy experimenting and comparing! 🚀""")
     gr.Markdown('## <p style="text-align: center;">Results</p>')
     results_md = gr.Markdown("")
     results_plot = gr.Plot(show_label=False)
-    eval_btn.click(
+    eval_btn.click(run, [data_subset, model_1, model_2, own_audio, own_transcription], [results_md, results_plot])
 
 demo.launch(debug=True)
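Note on the rewired click handler: `run` in processing.py is a generator (it ends in `yield results_md, fig`), and Gradio streams each yielded tuple to the output components as it arrives. A minimal sketch of that callback pattern, with illustrative component names that are not from app.py:

```python
import gradio as gr

def evaluate(text):
    # Gradio pushes every yielded value to the outputs as it is produced,
    # so the UI can show intermediate state before the handler finishes.
    yield "running..."
    yield f"done: {text.upper()}"

with gr.Blocks() as demo:
    inp = gr.Textbox(label="Input")
    btn = gr.Button("Evaluate")
    out = gr.Markdown("")
    btn.click(evaluate, [inp], [out])

demo.launch()
```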
dataset.py DELETED
@@ -1,93 +0,0 @@
-from datasets import load_dataset
-from datasets import Audio
-
-
-
-class Dataset:
-
-    def __init__(self, n:int = 100):
-
-        self.n = n
-        self.options = ['LibriSpeech Clean', 'LibriSpeech Other', 'Common Voice', 'VoxPopuli', 'TEDLIUM', 'GigaSpeech', 'SPGISpeech', 'AMI', 'OWN']
-        self.selected = None
-        self.dataset = None
-        self.text = None
-
-    def get_options(self):
-        return self.options
-
-    def _check_text(self):
-        sample = next(iter(self.dataset))
-        print(sample)
-        self._get_text(sample)
-
-    def _get_text(self, sample):
-        if "text" in sample:
-            self.text = "text"
-            return sample["text"]
-        elif "sentence" in sample:
-            self.text = "sentence"
-            return sample["sentence"]
-        elif "normalized_text" in sample:
-            self.text = "normalized_text"
-            return sample["normalized_text"]
-        elif "transcript" in sample:
-            self.text = "transcript"
-            return sample["transcript"]
-        else:
-            raise ValueError(f"Sample: {sample.keys()} has no transcript.")
-
-    def filter(self, input_column:str = None):
-
-        if input_column is None:
-            if self.text is not None:
-                input_column = self.text
-            else:
-                input_column = self._check_text()
-
-        def is_target_text_in_range(ref):
-            if ref.strip() == "ignore time segment in scoring":
-                return False
-            else:
-                return ref.strip() != ""
-
-        self.dataset = self.dataset.filter(is_target_text_in_range, input_columns=[input_column])
-        return self.dataset
-
-    def normalised(self, normalise):
-        self.dataset = self.dataset.map(normalise)
-
-    def _select(self, option:str):
-        if option not in self.options:
-            raise ValueError(f"This value is not an option, please see: {self.options}")
-        self.selected = option
-
-    def _preprocess(self):
-
-        self.dataset = self.dataset.take(self.n)
-        self.dataset = self.dataset.cast_column("audio", Audio(sampling_rate=16000))
-
-    def load(self, option:str = None):
-
-        self._select(option)
-
-        if option == "OWN":
-            pass
-        elif option == "LibriSpeech Clean":
-            self.dataset = load_dataset("librispeech_asr", "all", split="test.clean", streaming=True)
-        elif option == "LibriSpeech Other":
-            self.dataset = load_dataset("librispeech_asr", "all", split="test.other", streaming=True)
-        elif option == "Common Voice":
-            self.dataset = load_dataset("mozilla-foundation/common_voice_11_0", "en", revision="streaming", split="test", streaming=True, token=True, trust_remote_code=True)
-        elif option == "VoxPopuli":
-            self.dataset = load_dataset("facebook/voxpopuli", "en", split="test", streaming=True, trust_remote_code=True)
-        elif option == "TEDLIUM":
-            self.dataset = load_dataset("LIUM/tedlium", "release3", split="test", streaming=True, trust_remote_code=True)
-        elif option == "GigaSpeech":
-            self.dataset = load_dataset("speechcolab/gigaspeech", "xs", split="test", streaming=True, token=True, trust_remote_code=True)
-        elif option == "SPGISpeech":
-            self.dataset = load_dataset("kensho/spgispeech", "S", split="test", streaming=True, token=True, trust_remote_code=True)
-        elif option == "AMI":
-            self.dataset = load_dataset("edinburghcstr/ami", "ihm", split="test", streaming=True, trust_remote_code=True)
-
-        self._preprocess()
model.py DELETED
@@ -1,122 +0,0 @@
-# from transformers import WhisperProcessor, WhisperForConditionalGeneration
-from transformers import Speech2TextForConditionalGeneration, Speech2TextProcessor
-from transformers import pipeline
-
-# import nemo.collections.asr as nemo_asr
-
-from dataset import Dataset
-from utils import data
-
-
-
-class Model:
-
-
-    def __init__(self):
-
-        self.options = [
-            "openai/whisper-tiny.en",
-            "facebook/s2t-medium-librispeech-asr",
-            #"nvidia/stt_en_fastconformer_ctc_large"
-        ]
-        self.selected = None
-        self.pipeline = None
-        self.normalize = None
-
-    def get_options(self):
-        return self.options
-
-    def load(self, option:str = None):
-
-        if option is None:
-            if self.selected is None:
-                raise ValueError("No model selected. Please first select a model")
-            option = self.selected
-
-        if option not in self.options:
-            raise ValueError(f"Selected Option is not a valid value, see: {self.options}")
-
-        if option == "openai/whisper-tiny.en":
-            self.pipeline = pipeline("automatic-speech-recognition", model="openai/whisper-tiny.en")
-            self.normalize = self.pipeline.tokenizer.normalize
-
-        elif option == "facebook/s2t-medium-librispeech-asr":
-            self.model = Speech2TextForConditionalGeneration.from_pretrained("facebook/s2t-medium-librispeech-asr")
-            self.processor = Speech2TextProcessor.from_pretrained("facebook/s2t-medium-librispeech-asr", do_upper_case=True)
-
-        # elif option == "nvidia/stt_en_fastconformer_ctc_large":
-        #     self.model = nemo_asr.models.EncDecCTCModelBPE.from_pretrained(model_name="nvidia/stt_en_fastconformer_ctc_large")
-
-    def select(self, option:str=None):
-        if option not in self.options:
-            raise ValueError(f"This value is not an option, please see: {self.options}")
-        self.selected = option
-
-    def process(self, dataset:Dataset):
-
-        if self.selected is None:
-            raise ValueError("No Model is yet selected. Please select a model first")
-
-        if self.selected == "openai/whisper-tiny.en":
-            references, predictions = self._process_openai_whisper_tiny_en(dataset)
-        elif self.selected == "facebook/s2t-medium-librispeech-asr":
-            references, predictions = self._process_facebook_s2t_medium(dataset)
-        # elif self.selected == "nvidia/stt_en_fastconformer_ctc_large":
-        #     references, predictions = self._process_facebook_s2t_medium(dataset)
-
-        return references, predictions
-
-    def _process_openai_whisper_tiny_en(self, DaTaSeT:Dataset):
-
-        def normalise(batch):
-            batch["norm_text"] = self.normalize(DaTaSeT._get_text(batch))
-            return batch
-
-        DaTaSeT.normalised(normalise)
-        dataset = DaTaSeT.filter("norm_text")
-
-        predictions = []
-        references = []
-
-        # run streamed inference
-        for out in self.pipeline(data(dataset), batch_size=16):
-            predictions.append(self.normalize(out["text"]))
-            references.append(out["reference"][0])
-
-        return references, predictions
-
-    def _process_facebook_s2t_medium(self, DaTaSeT:Dataset):
-
-        def map_to_pred(batch):
-            features = self.processor(batch["audio"]["array"], sampling_rate=16000, padding=True, return_tensors="pt")
-            input_features = features.input_features
-            attention_mask = features.attention_mask
-
-            gen_tokens = self.model.generate(input_features=input_features, attention_mask=attention_mask)
-            batch["transcription"] = self.processor.batch_decode(gen_tokens, skip_special_tokens=True)[0]
-            return batch
-
-        DaTaSeT.dataset = DaTaSeT.dataset.take(100)
-        result = DaTaSeT.dataset.map(map_to_pred, remove_columns=["audio"])
-
-        predictions = []
-        references = []
-
-        DaTaSeT._check_text()
-        text_column = DaTaSeT.text
-
-        for sample in result:
-            predictions.append(sample['transcription'])
-            references.append(sample[text_column])
-
-        return references, predictions
-
-    def _process_stt_en_fastconformer_ctc_large(self, DaTaSeT:Dataset):
-
-
-        self.model.transcribe(['2086-149220-0033.wav'])
-
-        predictions = []
-        references = []
-
-        return references, predictions
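Aside on the deleted `_process_openai_whisper_tiny_en`: it relied on a `data(dataset)` generator from utils.py (not shown in this commit) and on the transformers pipeline echoing extra keys back on each output, which is why the reference is read as `out["reference"][0]`. A sketch of the shape that generator presumably had, assuming the standard streamed-inference recipe:

```python
from transformers import pipeline

asr = pipeline("automatic-speech-recognition", model="openai/whisper-tiny.en")

def data(dataset):
    # Keys the pipeline does not consume (here "reference") are batched into
    # lists and returned on each output dict, hence out["reference"][0].
    for sample in dataset:
        yield {**sample["audio"], "reference": sample["norm_text"]}

# for out in asr(data(dataset), batch_size=16):
#     prediction = out["text"]
#     reference = out["reference"][0]
```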
processing.py ADDED
@@ -0,0 +1,194 @@
+from transformers import WhisperProcessor, WhisperForConditionalGeneration
+from transformers import Speech2TextForConditionalGeneration, Speech2TextProcessor
+import plotly.graph_objs as go
+from datasets import load_dataset
+from datasets import Audio
+from transformers import pipeline
+import evaluate
+import librosa
+import numpy as np
+
+wer_metric = evaluate.load("wer")
+
+def run(data_subset:str, model_1:str, model_2:str, own_audio, own_transcription:str):
+
+    if data_subset is None:
+        raise ValueError("No Dataset selected")
+    if model_1 is None:
+        raise ValueError("No Model 1 selected")
+    if model_2 is None:
+        raise ValueError("No Model 2 selected")
+
+    if data_subset == "Common Voice":
+        dataset, text_column = load_Common_Voice()
+    elif data_subset == "VoxPopuli":
+        dataset, text_column = load_Vox_Populi()
+    elif data_subset == "OWN Recoding/Sample":
+        sr, audio = own_audio
+        audio = audio.astype(np.float32) / 32768.0
+        print("AUDIO: ", type(audio), audio)
+        audio = librosa.resample(audio, orig_sr=sr, target_sr=16000)
+    else:
+        # fall back to Common Voice if no known subset was selected
+        dataset, text_column = load_Common_Voice()
+    print("Dataset Loaded")
+
+    # check if models are the same
+    model1, processor1 = load_model(model_1)
+    model2, processor2 = load_model(model_2)
+    print("Models Loaded")
+
+    if data_subset == "OWN Recoding/Sample":
+        sample = {"audio":{"array":audio,"sampling_rate":16000}}
+        transcription1 = model_compute(model1, processor1, sample, model_1)
+        transcription2 = model_compute(model2, processor2, sample, model_2)
+
+        transcriptions1 = [transcription1]
+        transcriptions2 = [transcription2]
+        references = [own_transcription]
+
+        wer1 = compute_wer(references, transcriptions1)
+        wer2 = compute_wer(references, transcriptions2)
+
+        results_md = f"""#### {model_1}
+- WER Score: {wer1}
+
+#### {model_2}
+- WER Score: {wer2}"""
+
+        # Create the bar plot
+        fig = go.Figure(
+            data=[
+                go.Bar(x=[f"{model_1}"], y=[wer1]),
+                go.Bar(x=[f"{model_2}"], y=[wer2]),
+            ]
+        )
+        # Update the layout for better visualization
+        fig.update_layout(
+            title="Comparison of Two Models",
+            xaxis_title="Models",
+            yaxis_title="Value",
+            barmode="group",
+        )
+
+        yield results_md, fig
+
+    else:
+        references = []
+        transcriptions1 = []
+        transcriptions2 = []
+        counter = 0
+        for sample in dataset:
+            print(counter)
+            counter += 1
+
+            references.append(sample[text_column])
+
+            if model_1 == model_2:
+                transcription = model_compute(model1, processor1, sample, model_1)
+
+                transcriptions1.append(transcription)
+                transcriptions2.append(transcription)
+            else:
+                transcriptions1.append(model_compute(model1, processor1, sample, model_1))
+                transcriptions2.append(model_compute(model2, processor2, sample, model_2))
+
+        wer1 = compute_wer(references, transcriptions1)
+        wer2 = compute_wer(references, transcriptions2)
+
+        results_md = f"""#### {model_1}
+- WER Score: {wer1}
+
+#### {model_2}
+- WER Score: {wer2}"""
+
+        # Create the bar plot
+        fig = go.Figure(
+            data=[
+                go.Bar(x=[f"{model_1}"], y=[wer1]),
+                go.Bar(x=[f"{model_2}"], y=[wer2]),
+            ]
+        )
+
+        # Update the layout for better visualization
+        fig.update_layout(
+            title="Comparison of Two Models",
+            xaxis_title="Models",
+            yaxis_title="Value",
+            barmode="group",
+        )
+
+        yield results_md, fig
+
+
+
+
+# DATASET LOADERS
+def load_Common_Voice():
+    dataset = load_dataset("mozilla-foundation/common_voice_11_0", "en", revision="streaming", split="test", streaming=True, token=True, trust_remote_code=True)
+    text_column = "sentence"
+    dataset = dataset.take(100)
+    dataset = dataset.cast_column("audio", Audio(sampling_rate=16000))
+    dataset = list(dataset)
+    return dataset, text_column
+
+def load_Vox_Populi():
+    dataset = load_dataset("facebook/voxpopuli", "en", split="test", streaming=True, trust_remote_code=True)
+    print(next(iter(dataset)))
+    text_column = "raw_text"
+    dataset = dataset.take(100)
+    dataset = dataset.cast_column("audio", Audio(sampling_rate=16000))
+    dataset = list(dataset)
+    return dataset, text_column
+
+
+
+# MODEL LOADERS
+def load_model(model_id:str):
+    if model_id == "openai/whisper-tiny.en":
+        model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny.en")
+        processor = WhisperProcessor.from_pretrained("openai/whisper-tiny.en")
+    elif model_id == "facebook/s2t-medium-librispeech-asr":
+        model = Speech2TextForConditionalGeneration.from_pretrained("facebook/s2t-medium-librispeech-asr")
+        processor = Speech2TextProcessor.from_pretrained("facebook/s2t-medium-librispeech-asr", do_upper_case=True)
+    else:
+        model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny.en")
+        processor = WhisperProcessor.from_pretrained("openai/whisper-tiny.en")
+
+    return model, processor
+
+
+# MODEL INFERENCE
+def model_compute(model, processor, sample, model_id):
+
+    if model_id == "openai/whisper-tiny.en":
+        sample = sample["audio"]
+        input_features = processor(sample["array"], sampling_rate=sample["sampling_rate"], return_tensors="pt").input_features
+        predicted_ids = model.generate(input_features)
+        transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)
+        return transcription[0]
+    elif model_id == "facebook/s2t-medium-librispeech-asr":
+        sample = sample["audio"]
+        features = processor(sample["array"], sampling_rate=16000, padding=True, return_tensors="pt")
+        input_features = features.input_features
+        attention_mask = features.attention_mask
+        gen_tokens = model.generate(input_features=input_features, attention_mask=attention_mask)
+        transcription = processor.batch_decode(gen_tokens, skip_special_tokens=True)[0]
+        return transcription
+
+    else:
+        return model(sample)
+
+# UTILS
+def compute_wer(references, predictions):
+    wer = wer_metric.compute(references=references, predictions=predictions)
+    wer = round(100 * wer, 2)
+    return wer
+
+
+# print(load_Vox_Populi())
+# print(run("Common Voice", "openai/whisper-tiny.en", "openai/whisper-tiny.en", None, None))
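For reference, `compute_wer` wraps the `evaluate` library's WER metric and reports it as a percentage. A quick sanity check of those semantics, with toy strings worked out by hand:

```python
import evaluate

wer_metric = evaluate.load("wer")
references = ["the cat sat on the mat"]
predictions = ["the cat sat on a mat"]

# One substitution against six reference words: WER = 1/6,
# which compute_wer would report as 16.67 after rounding.
wer = wer_metric.compute(references=references, predictions=predictions)
print(round(100 * wer, 2))  # 16.67
```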
utils.py CHANGED
@@ -22,3 +22,14 @@ def compute_wer(references, predictions):
     return wer
 
 
+# def run_tests (dataset_choice:str, model:str):
+
+#     MoDeL = Model()
+#     MoDeL.select(model)
+#     MoDeL.load()
+#     DaTaSeT = Dataset(100)
+#     DaTaSeT.load(dataset_choice)
+#     references, predictions = MoDeL.process(DaTaSeT)
+#     wer = compute_wer(references=references, predictions=predictions)
+#     return wer
+