import streamlit as st

from app.draw_diagram import draw_table
from app.content import *
from app.summarization import sum_table_mulit_metrix
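
# Streamlit page definitions for the AudioBench leaderboard: the dashboard
# landing page plus one rendering function per evaluation task.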

def dataset_contents(dataset, metrics):
    # Render short descriptions of the selected dataset and metric.
    custom_css = """
        <style>
            .my-dataset-info {
                /* background-color: #F9EBEA; */
                /* padding: 10px; */
                color: #050505;
                font-style: normal;
                font-size: 8px;
                height: auto;
            }
        </style>
        """
    st.markdown(custom_css, unsafe_allow_html=True)
    st.markdown(f"""<div class="my-dataset-info">
                    <p><b>About this dataset</b>: {dataset}</p>
                    </div>""", unsafe_allow_html=True)
    st.markdown(f"""<div class="my-dataset-info">
                    <p><b>About this metric</b>: {metrics}</p>
                    </div>""", unsafe_allow_html=True)

def dashboard():

    with st.container():
        st.title("Leaderboard for AudioBench")

        st.markdown("""
[gh1]: https://github.com/AudioLLMs/AudioBench
[gh2]: https://github.com/AudioLLMs/Awesome-Audio-LLM

**Toolkit:** [GitHub][gh1] |
[**Paper @ NAACL 2025**](https://arxiv.org/abs/2406.16020) |
**Resources for AudioLLMs:** [GitHub][gh2]
""")

        st.markdown("""
#### Recent updates
- **Jan. 2025**: AudioBench was officially accepted to NAACL 2025!
- **Jan. 2025**: Updated the layout.
- **Dec. 2024**: Added the MuChoMusic dataset for Music Understanding - MCQ Questions, from the paper: https://arxiv.org/abs/2408.01337.
- **Dec. 2024**: Singlish ASR task added! The datasets are available on [HF](https://huggingface.co/datasets/MERaLiON/MNSC).
- **Dec. 2024**: Updated the layout and added support for comparing models of similar sizes. 1) Reorganized the layout for a better user experience. 2) Added a performance summary for each task.
- **Aug. 2024**: Initial leaderboard is now online.
""")

        st.divider()

        st.markdown("""
#### Evaluating Audio-based Large Language Models
- AudioBench is a comprehensive evaluation benchmark designed for general instruction-following audio large language models.
- AudioBench is an evaluation benchmark that we continually improve and maintain.

Below are the initial 26 datasets included in AudioBench. We have since extended it to over 50 datasets and will add more in the future.
""")

    with st.container():
        st.markdown("###### :dart: Our Benchmark includes: ")
        cols = st.columns(8)  # extra columns keep the three metric cards narrow
        cols[0].metric(label="Tasks", value=">10")
        cols[1].metric(label="Datasets", value=">50")
        cols[2].metric(label="Evaluated Models", value=">10")

    st.divider()

    with st.container():
        left_co, right_co = st.columns([1, 0.1])

        with left_co:
            st.markdown("""
##### Citations :round_pushpin:
```
@article{wang2024audiobench,
  title={AudioBench: A Universal Benchmark for Audio Large Language Models},
  author={Wang, Bin and Zou, Xunlong and Lin, Geyu and Sun, Shuo and Liu, Zhuohan and Zhang, Wenyu and Liu, Zhengyuan and Aw, AiTi and Chen, Nancy F},
  journal={NAACL},
  year={2025}
}
```
```
@article{zhang2024mowe,
  title={MoWE-Audio: Multitask AudioLLMs with Mixture of Weak Encoders},
  author={Zhang, Wenyu and Sun, Shuo and Wang, Bin and Zou, Xunlong and Liu, Zhuohan and He, Yingxu and Lin, Geyu and Chen, Nancy F and Aw, Ai Ti},
  journal={ICASSP},
  year={2025}
}
```
```
@article{wang2025advancing,
  title={Advancing Singlish Understanding: Bridging the Gap with Datasets and Multimodal Models},
  author={Wang, Bin and Zou, Xunlong and Sun, Shuo and Zhang, Wenyu and He, Yingxu and Liu, Zhuohan and Wei, Chengwei and Chen, Nancy F and Aw, AiTi},
  journal={arXiv preprint arXiv:2501.01034},
  year={2025}
}
```
```
@article{he2024meralion,
  title={MERaLiON-AudioLLM: Technical Report},
  author={He, Yingxu and Liu, Zhuohan and Sun, Shuo and Wang, Bin and Zhang, Wenyu and Zou, Xunlong and Chen, Nancy F and Aw, Ai Ti},
  journal={arXiv preprint arXiv:2412.09818},
  year={2024}
}
```
""")

def asr_english():
    st.title("Task: Automatic Speech Recognition - English")

    sum = ['Overall']
    dataset_list = [
        'LibriSpeech-Clean',
        'LibriSpeech-Other',
        'CommonVoice-15-EN',
        'Peoples-Speech',
        'GigaSpeech-1',
        'Earnings-21',
        'Earnings-22',
        'TED-LIUM-3',
        'TED-LIUM-3-LongForm',
    ]
    filters_1_list = sum + dataset_list

    space1, space2, _, _ = st.columns([0.4, 0.4, 0.2, 0.2])
    with space1:
        tab_section = st.selectbox('Dataset', filters_1_list)
    with space2:
        metric = st.selectbox('Metric', ['WER'])
        metric = metric.lower()

    if tab_section:
        if tab_section in sum:
            sum_table_mulit_metrix(dataset_list, metric)
        else:
            dataset_contents(dataset_diaplay_information[tab_section], metrics_info[metric])
            draw_table(tab_section, metric)

def asr_singlish():
    st.title("Task: Automatic Speech Recognition - Singlish")

    sum = ['Overall']
    dataset_list = [
        'MNSC-PART1-ASR',
        'MNSC-PART2-ASR',
        'MNSC-PART3-ASR',
        'MNSC-PART4-ASR',
        'MNSC-PART5-ASR',
        'MNSC-PART6-ASR',
        'SEAME-Dev-Man',
        'SEAME-Dev-Sge',
    ]
    filters_1_list = sum + dataset_list

    space1, space2, _, _ = st.columns([0.4, 0.4, 0.2, 0.2])
    with space1:
        tab_section = st.selectbox('Dataset', filters_1_list)
    with space2:
        metric = st.selectbox('Metric', ['WER'])
        metric = metric.lower()

    if tab_section:
        if tab_section in sum:
            sum_table_mulit_metrix(dataset_list, metric)
        else:
            dataset_contents(dataset_diaplay_information[tab_section], metrics_info[metric])
            draw_table(tab_section, metric)

def asr_sea():
    st.title("Task: Automatic Speech Recognition - SEA Region")

    sum = ['Overall']
    dataset_list = [
        'GigaSpeech2-Indo',
        'GigaSpeech2-Thai',
        'GigaSpeech2-Viet',
    ]
    filters_1_list = sum + dataset_list

    space1, space2, _, _ = st.columns([0.4, 0.4, 0.2, 0.2])
    with space1:
        tab_section = st.selectbox('Dataset', filters_1_list)
    with space2:
        metric = st.selectbox('Metric', ['WER'])
        metric = metric.lower()

    if tab_section:
        if tab_section in sum:
            sum_table_mulit_metrix(dataset_list, metric)
        else:
            dataset_contents(dataset_diaplay_information[tab_section], metrics_info[metric])
            draw_table(tab_section, metric)

def asr_mandarin():
    st.title("Task: Automatic Speech Recognition - Mandarin")

    sum = ['Overall']
    dataset_list = [
        'AISHELL-ASR-ZH',
    ]
    filters_1_list = sum + dataset_list

    space1, space2, _, _ = st.columns([0.4, 0.4, 0.2, 0.2])
    with space1:
        tab_section = st.selectbox('Dataset', filters_1_list)
    with space2:
        metric = st.selectbox('Metric', ['WER'])
        metric = metric.lower()

    if tab_section:
        if tab_section in sum:
            sum_table_mulit_metrix(dataset_list, metric)
        else:
            dataset_contents(dataset_diaplay_information[tab_section], metrics_info[metric])
            draw_table(tab_section, metric)

def speech_translation():
    st.title("Task: Speech Translation")

    sum = ['Overall']
    dataset_list = [
        'CoVoST2-EN-ID',
        'CoVoST2-EN-ZH',
        'CoVoST2-EN-TA',
        'CoVoST2-ID-EN',
        'CoVoST2-ZH-EN',
        'CoVoST2-TA-EN',
    ]
    filters_1_list = sum + dataset_list

    space1, space2, _, _ = st.columns([0.4, 0.4, 0.2, 0.2])
    with space1:
        tab_section = st.selectbox('Dataset', filters_1_list)
    with space2:
        metric = st.selectbox('Metric', ['BLEU'])
        metric = metric.lower()

    if tab_section:
        if tab_section in sum:
            sum_table_mulit_metrix(dataset_list, metric)
        else:
            dataset_contents(dataset_diaplay_information[tab_section], metrics_info[metric])
            draw_table(tab_section, metric)

def speech_question_answering_english():
    st.title("Task: Spoken Question Answering - English")

    sum = ['Overall']
    dataset_list = [
        'CN-College-Listen-MCQ',
        'DREAM-TTS-MCQ',
        'SLUE-P2-SQA5',
        'Public-SG-Speech-QA',
        'Spoken-SQuAD',
    ]
    filters_1_list = sum + dataset_list

    space1, space2, _, _ = st.columns([0.4, 0.4, 0.2, 0.2])
    with space1:
        tab_section = st.selectbox('Dataset', filters_1_list)
    with space2:
        metric = st.selectbox('Metric', ['LLAMA3_70B_JUDGE', 'GPT4O_JUDGE'])
        metric = metric.lower()

    if tab_section:
        if tab_section in sum:
            sum_table_mulit_metrix(dataset_list, metric)
        else:
            dataset_contents(dataset_diaplay_information[tab_section], metrics_info[metric])
            draw_table(tab_section, metric)

def speech_question_answering_singlish():
    st.title("Task: Spoken Question Answering - Singlish")

    sum = ['Overall']
    dataset_list = [
        'MNSC-PART3-SQA',
        'MNSC-PART4-SQA',
        'MNSC-PART5-SQA',
        'MNSC-PART6-SQA',
    ]
    filters_1_list = sum + dataset_list

    space1, space2, _, _ = st.columns([0.4, 0.4, 0.2, 0.2])
    with space1:
        tab_section = st.selectbox('Dataset', filters_1_list)
    with space2:
        metric = st.selectbox('Metric', ['LLAMA3_70B_JUDGE', 'GPT4O_JUDGE'])
        metric = metric.lower()

    if tab_section:
        if tab_section in sum:
            sum_table_mulit_metrix(dataset_list, metric)
        else:
            dataset_contents(dataset_diaplay_information[tab_section], metrics_info[metric])
            draw_table(tab_section, metric)

def spoken_dialogue_summarization_singlish():
    st.title("Task: Spoken Dialogue Summarization - Singlish")

    sum = ['Overall']
    dataset_list = [
        'MNSC-PART3-SDS',
        'MNSC-PART4-SDS',
        'MNSC-PART5-SDS',
        'MNSC-PART6-SDS',
    ]
    filters_1_list = sum + dataset_list

    space1, space2, _, _ = st.columns([0.4, 0.4, 0.2, 0.2])
    with space1:
        tab_section = st.selectbox('Dataset', filters_1_list)
    with space2:
        metric = st.selectbox('Metric', ['LLAMA3_70B_JUDGE', 'GPT4O_JUDGE'])
        metric = metric.lower()

    if tab_section:
        if tab_section in sum:
            sum_table_mulit_metrix(dataset_list, metric)
        else:
            dataset_contents(dataset_diaplay_information[tab_section], metrics_info[metric])
            draw_table(tab_section, metric)

def speech_instruction():
    st.title("Task: Speech Instruction")

    sum = ['Overall']
    dataset_list = [
        'OpenHermes-Audio',
        'ALPACA-Audio',
    ]
    filters_1_list = sum + dataset_list

    space1, space2, _, _ = st.columns([0.4, 0.4, 0.2, 0.2])
    with space1:
        tab_section = st.selectbox('Dataset', filters_1_list)
    with space2:
        metric = st.selectbox('Metric', ['LLAMA3_70B_JUDGE', 'GPT4O_JUDGE'])
        metric = metric.lower()

    if tab_section:
        if tab_section in sum:
            sum_table_mulit_metrix(dataset_list, metric)
        else:
            dataset_contents(dataset_diaplay_information[tab_section], metrics_info[metric])
            draw_table(tab_section, metric)

def audio_captioning():
    st.title("Task: Audio Captioning")

    # Captioning is reported per dataset only; there is no 'Overall' summary.
    dataset_list = [
        'WavCaps',
        'AudioCaps',
    ]

    space1, space2, _, _ = st.columns([0.4, 0.4, 0.2, 0.2])
    with space1:
        tab_section = st.selectbox('Dataset', dataset_list)
    with space2:
        metric = st.selectbox('Metric', ['LLAMA3_70B_JUDGE', 'GPT4O_JUDGE', 'METEOR'])
        metric = metric.lower()

    if tab_section:
        dataset_contents(dataset_diaplay_information[tab_section], metrics_info[metric])
        draw_table(tab_section, metric)

def audio_scene_question_answering():
    st.title("Task: Audio Scene Question Answering")

    sum = ['Overall']
    dataset_list = [
        'Clotho-AQA',
        'WavCaps-QA',
        'AudioCaps-QA',
    ]
    filters_1_list = sum + dataset_list

    space1, space2, _, _ = st.columns([0.4, 0.4, 0.2, 0.2])
    with space1:
        tab_section = st.selectbox('Dataset', filters_1_list)
    with space2:
        metric = st.selectbox('Metric', ['LLAMA3_70B_JUDGE', 'GPT4O_JUDGE'])
        metric = metric.lower()

    if tab_section:
        if tab_section in sum:
            sum_table_mulit_metrix(dataset_list, metric)
        else:
            dataset_contents(dataset_diaplay_information[tab_section], metrics_info[metric])
            draw_table(tab_section, metric)

def accent_recognition():
    st.title("Task: Accent Recognition")

    sum = ['Overall']
    dataset_list = [
        'VoxCeleb-Accent',
        'MNSC-AR-Sentence',
        'MNSC-AR-Dialogue',
    ]
    filters_1_list = sum + dataset_list

    space1, space2, _, _ = st.columns([0.4, 0.4, 0.2, 0.2])
    with space1:
        tab_section = st.selectbox('Dataset', filters_1_list)
    with space2:
        metric = st.selectbox('Metric', ['LLAMA3_70B_JUDGE', 'GPT4O_JUDGE'])
        metric = metric.lower()

    if tab_section:
        if tab_section in sum:
            sum_table_mulit_metrix(dataset_list, metric)
        else:
            dataset_contents(dataset_diaplay_information[tab_section], metrics_info[metric])
            draw_table(tab_section, metric)

def gender_recognition():
    st.title("Task: Gender Recognition")

    sum = ['Overall']
    dataset_list = [
        'VoxCeleb-Gender',
        'IEMOCAP-Gender',
    ]
    filters_1_list = sum + dataset_list

    space1, space2, _, _ = st.columns([0.4, 0.4, 0.2, 0.2])
    with space1:
        tab_section = st.selectbox('Dataset', filters_1_list)
    with space2:
        metric = st.selectbox('Metric', ['LLAMA3_70B_JUDGE', 'GPT4O_JUDGE'])
        metric = metric.lower()

    if tab_section:
        if tab_section in sum:
            sum_table_mulit_metrix(dataset_list, metric)
        else:
            dataset_contents(dataset_diaplay_information[tab_section], metrics_info[metric])
            draw_table(tab_section, metric)

def emotion_recognition():
    st.title("Task: Emotion Recognition")

    sum = ['Overall']
    dataset_list = [
        'IEMOCAP-Emotion',
        'MELD-Sentiment',
        'MELD-Emotion',
    ]
    filters_1_list = sum + dataset_list

    space1, space2, _, _ = st.columns([0.4, 0.4, 0.2, 0.2])
    with space1:
        tab_section = st.selectbox('Dataset', filters_1_list)
    with space2:
        metric = st.selectbox('Metric', ['LLAMA3_70B_JUDGE', 'GPT4O_JUDGE'])
        metric = metric.lower()

    if tab_section:
        if tab_section in sum:
            sum_table_mulit_metrix(dataset_list, metric)
        else:
            dataset_contents(dataset_diaplay_information[tab_section], metrics_info[metric])
            draw_table(tab_section, metric)

def music_understanding():
    st.title("Task: Music Understanding - MCQ Questions")

    sum = ['Overall']
    dataset_list = [
        'MuChoMusic',
    ]
    filters_1_list = sum + dataset_list

    space1, space2, _, _ = st.columns([0.4, 0.4, 0.2, 0.2])
    with space1:
        tab_section = st.selectbox('Dataset', filters_1_list)
    with space2:
        metric = st.selectbox('Metric', ['LLAMA3_70B_JUDGE', 'GPT4O_JUDGE'])
        metric = metric.lower()

    if tab_section:
        if tab_section in sum:
            sum_table_mulit_metrix(dataset_list, metric)
        else:
            dataset_contents(dataset_diaplay_information[tab_section], metrics_info[metric])
            draw_table(tab_section, metric)
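
# Datasets still under evaluation: the metric offered depends on whether the
# selected dataset is an ASR set (WER) or a QA/summarization set (model-as-judge).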

def under_development():
    st.title("Task: Under Development")

    wer_dataset_list = [
        'CNA',
        'IDPC',
        'Parliament',
        'UKUS-News',
        'Mediacorp',
        'IDPC-Short',
        'Parliament-Short',
        'UKUS-News-Short',
        'Mediacorp-Short',
        'YTB-ASR-Batch1',
        'YTB-ASR-Batch2',
    ]
    judge_dataset_list = [
        'YTB-SQA-Batch1',
        'YTB-SDS-Batch1',
        'YTB-PQA-Batch1',
    ]
    filters_1_list = wer_dataset_list + judge_dataset_list

    space1, space2, _, _ = st.columns([0.4, 0.4, 0.2, 0.2])
    with space1:
        tab_section = st.selectbox('Dataset', filters_1_list)
    with space2:
        if tab_section in wer_dataset_list:
            metric = st.selectbox('Metric', ['WER'])
        elif tab_section in judge_dataset_list:
            metric = st.selectbox('Metric', ['LLAMA3_70B_JUDGE', 'GPT4O_JUDGE'])
        else:
            raise ValueError('Invalid dataset')
        metric = metric.lower()

    if tab_section:
        dataset_contents(dataset_diaplay_information[tab_section], metrics_info[metric])
        draw_table(tab_section, metric)

def mmau_evaluation():
    st.title("Task: MMAU-Audio Understanding")

    dataset_list = [
        'MMAU-mini',
        'MMAU-mini-music',
        'MMAU-mini-sound',
        'MMAU-mini-speech',
    ]
    filters_1_list = dataset_list

    space1, space2, _, _ = st.columns([0.4, 0.4, 0.2, 0.2])
    with space1:
        tab_section = st.selectbox('Dataset', filters_1_list)
    with space2:
        metric = st.selectbox('Metric', ['LLAMA3_70B_JUDGE', 'STRING_MATCH', 'GPT4O_JUDGE'])
        metric = metric.lower()

    if tab_section:
        dataset_contents(dataset_diaplay_information[tab_section], metrics_info[metric])
        draw_table(tab_section, metric)