# AudioBench-Leaderboard / process_log.py
import os
import re
import sys
import json
import random
import pandas as pd
import numpy as np
from app.content import *
data_to_df = []
log_dir = "path/to/audiobench/log"
all_evaluated_models = os.listdir(log_dir)
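# The parsing below assumes each model owns one sub-directory of per-dataset score files,
# roughly like this (layout inferred from the filename regex; dataset names are hypothetical):
#
#   <log_dir>/
#       <model_name>/
#           <dataset>_wer_score.json
#           <dataset>_bleu_score.json
#           <dataset>_llama3_70b_judge_score.json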
# Walk every model's log directory and collect one (model, dataset, metric, value) row per score file.
for model_name in all_evaluated_models:
    # Skip internal / debugging runs that should not appear on the leaderboard.
    if "geyu_whisper" in model_name:
        continue
    if "activation_checkpointing" in model_name:
        continue
    model_dir = os.path.join(log_dir, model_name)
    if not os.path.isdir(model_dir):
        continue
    for log_file in os.listdir(model_dir):
        if not log_file.endswith("score.json"):
            continue
        match = re.match(r"^(.*?)_(llama3_70b_judge|wer|bleu)_score\.json$", log_file)
        if match is None:
            # Score files for metrics other than the three above are ignored here.
            continue
        ds_name = match.group(1)
        metrics = match.group(2)
        eval_path = os.path.join(model_dir, log_file)
        with open(eval_path, "r") as f:
            eval_data = json.load(f)
        # The LLM-as-judge score is nested one level deeper than the WER/BLEU scores.
        if metrics == "llama3_70b_judge":
            value = eval_data[metrics]["judge_score"]
        elif metrics in ("wer", "bleu"):
            value = eval_data[metrics]
        data_to_df.append([model_name, ds_name, metrics, value])
eval_result_df = pd.DataFrame(data_to_df, columns=["model", "dataset", "metrics", "value"])
eval_result_df["model"] = eval_result_df["model"].replace("MERaLiON_AudioLLM_v1_hf", "MERaLiON-AudioLLM-Whisper-SEA-LION")
# Archived copy of the original results_organized folder; newly parsed scores are merged on top of it.
archive_results_dir = "results_organized_archive"
output_results_dir = "results_organized"
def merge_results(display_datasets, metrics, result_sub_path=None):
    """Pivot the newly parsed scores for one task group into a wide table and
    merge them with the archived leaderboard CSV, if one exists."""
    raw_ds_names = [displayname2datasetname[dis_name] for dis_name in display_datasets]
    new_result = eval_result_df[
        eval_result_df["dataset"].isin(raw_ds_names) & (eval_result_df["metrics"] == metrics)
    ]
    new_result = new_result.drop(columns=["metrics"])
    new_result = new_result.pivot(index="model", columns="dataset", values="value").reset_index()
    new_result = new_result.rename(columns={"model": "Model"})
    # Keep only models that have been evaluated on every dataset of this group.
    new_result = new_result.dropna(axis=0, how="any")

    archive_result_path = os.path.join(archive_results_dir, result_sub_path)
    if os.path.exists(archive_result_path):
        archive_result = pd.read_csv(archive_result_path)
        archive_columns = [col for col in archive_result.columns if col in raw_ds_names]
        archive_result = archive_result[["Model"] + archive_columns]
        combined_result = pd.concat([archive_result, new_result], ignore_index=True)
        # When a model appears in both the archive and the new logs, keep the freshly computed scores.
        combined_result = combined_result.drop_duplicates(subset=["Model"], keep="last", ignore_index=True)
        return new_result, combined_result
    return new_result, new_result
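# Sketch of the intended call pattern (the dataset group and sub-path are examples drawn from
# the mapper below): merge_results(asr_english_datasets, "wer", "wer/asr_english.csv")
# returns (new_result, combined_result); because drop_duplicates uses keep="last", a model
# present in both the archive CSV and the fresh logs ends up with its freshly parsed scores.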
result_file_mapper = {
    "bleu/st.csv": speech_translation_datasets,
    "llama3_70b_judge/accent_recognition.csv": ar_datasets,
    "llama3_70b_judge/audio_captioning.csv": ac_datasets,
    "llama3_70b_judge/audio_scene_question_answering.csv": asqa_datasets,
    "llama3_70b_judge/emotion_recognition.csv": er_datasets,
    "llama3_70b_judge/gender_recognition.csv": gr_datasets,
    "llama3_70b_judge/music_understanding.csv": music_datasets,
    "llama3_70b_judge/sds_singlish.csv": sds_datasets,
    "llama3_70b_judge/speech_instruction.csv": si_datasets,
    "llama3_70b_judge/sqa_english.csv": speech_qa_english_datasets,
    "llama3_70b_judge/sqa_singlish.csv": speech_qa_singlish_datasets,
    "llama3_70b_judge/under_development_llama3_70b_judge.csv": non_wer_development_datasets,
    "meteor/audio_captioning.csv": ac_datasets,
    "wer/asr_english.csv": asr_english_datasets,
    "wer/asr_singlish.csv": asr_singlish_datasets,
    "wer/asr_mandarin.csv": asr_mandarin_datasets,
    "wer/asr_malay.csv": asr_malay_datasets,
    "wer/asr_tamil.csv": asr_tamil_datasets,
    "wer/asr_indonesian.csv": asr_indonesian_datasets,
    "wer/asr_thai.csv": asr_thai_datasets,
    "wer/asr_vietnamese.csv": asr_vietnamese_datasets,
    "wer/asr_private.csv": asr_private_datasets,
    "wer/under_development_wer.csv": wer_development_datasets,
}
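# Note: the log parser above only extracts llama3_70b_judge, wer and bleu scores, so the
# "meteor/audio_captioning.csv" table is populated purely from the archived CSV (if present).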
for sub_path, display_ds in result_file_mapper.items():
    # The metric name is encoded as the first path component, e.g. "wer/asr_english.csv" -> "wer".
    metrics = sub_path.split("/")[0]
    new_result, combined_result = merge_results(display_ds, metrics, sub_path)
    output_path = os.path.join(output_results_dir, sub_path)
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    combined_result.to_csv(output_path, index=False)
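# The result is one wide CSV per task group under results_organized/, e.g.
# results_organized/wer/asr_english.csv, each with a "Model" column followed by one column
# per dataset in that group.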