import json
import re
from pathlib import Path

import evaluate
import gradio as gr
import numpy as np
from transformers import pipeline

# Word Error Rate (WER) metric used to score each model's transcription.
wer_metric = evaluate.load("wer")
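# Quick sanity check of how the metric behaves (substitutions, deletions, and
# insertions are counted against the reference word count), e.g.:
#   wer_metric.compute(predictions=["hello word"], references=["hello world"])
#   -> 0.5 (one substituted word out of a two-word reference)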

# Display name -> Hugging Face checkpoint for each ASR model under comparison.
model_name = {
    "whisper-tiny": "openai/whisper-tiny.en",
    "wav2vec2-large-960h": "facebook/wav2vec2-large-960h",
    "distill-whisper-small": "distil-whisper/distil-small.en",
}

# Reference transcriptions for the bundled example audio files; ds_data.json is
# expected to hold a list of {"reference": ...} entries, one per audio file.
with open("ds_data.json", "r") as f:
    table_data = json.load(f)


def clean_text(text):
    # Strip basic punctuation so WER compares words rather than punctuation.
    return re.sub(r'[.,!?]', '', text)
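# For example, clean_text("Hello, world!") returns "Hello world".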


def compute_wer_table(audio, text):
    # Gradio's Audio component returns a (sample_rate, data) tuple with int16 PCM
    # samples; convert the data to float32 in [-1, 1] for the ASR pipelines.
    sampling_rate, audio_input = audio
    audio_input = audio_input.astype(np.float32) / 32767.0
    # Downmix stereo uploads to mono, since the pipelines expect a 1-D waveform.
    if audio_input.ndim > 1:
        audio_input = audio_input.mean(axis=1)

    trans = []
    wer_scores = []

    for model in model_name:
        pipe = pipeline("automatic-speech-recognition", model=model_name[model])
        # Pass the sampling rate along so the pipeline can resample if needed.
        transcription = pipe({"raw": audio_input, "sampling_rate": sampling_rate})['text']

        transcription = clean_text(transcription)
        trans.append(transcription)
        # Upper-case both sides so the comparison is case-insensitive.
        wer = wer_metric.compute(predictions=[transcription.upper()], references=[text.upper()])
        wer_scores.append(wer)

    result = [[model, t, s] for model, t, s in zip(model_name.keys(), trans, wer_scores)]
    return result
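# Illustrative output shape (actual transcriptions and scores will vary):
#   [["whisper-tiny", "HELLO WORLD", 0.0],
#    ["wav2vec2-large-960h", "HELLO WORLD", 0.0],
#    ["distill-whisper-small", "HELLO WORLD", 0.0]]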


with gr.Blocks() as demo:
    with gr.Tab("Docs"):
        # Render the accompanying documentation shipped next to this script.
        gr.Markdown((Path(__file__).parent / "demo.md").read_text())
    with gr.Tab("Demo"):
        gr.Interface(
            fn=compute_wer_table,
            inputs=[
                gr.Audio(label="Input Audio"),
                gr.Textbox(label="Reference Text"),
            ],
            outputs=gr.Dataframe(headers=["Model", "Transcription", "WER"], label="WER Results"),
            # One example per bundled audio clip, paired with its reference text.
            examples=[
                [f"assets/output_audio_{i}.wav", table_data[i]['reference']]
                for i in range(len(table_data))
            ],
            title="ASR Model Evaluation",
            description=(
                "This application allows you to evaluate the performance of various Automatic Speech Recognition (ASR) models on "
                "a given audio sample. Simply provide an audio file and the corresponding reference text, and the app will compute "
                "the Word Error Rate (WER) for each model. The results will be presented in a table that includes the model name, "
                "the transcribed text, and the calculated WER. "
                "\n\n### Table of Results\n"
                "The table below shows the overall WER achieved by each ASR model on the bundled dataset. "
                "Lower WER scores indicate better performance."
                "\n\n| Model | WER |\n"
                "|--------------------------|--------------------------|\n"
                "| [whisper-tiny](https://huggingface.co/openai/whisper-tiny.en) | 0.05511 |\n"
                "| [wav2vec2-large-960h](https://huggingface.co/facebook/wav2vec2-large-960h) | 0.01617 |\n"
                "| [distill-whisper-small](https://huggingface.co/distil-whisper/distil-small.en) | 0.03686 |\n"
                "\n\n### Data Source\n"
                "The data used in this demo is a subset of the [LibriSpeech](https://huggingface.co/datasets/openslr/librispeech_asr) dataset: the first 100 audio samples from the validation split and their corresponding reference texts."
            ),
        )


demo.launch()