zhuohan-7 committed
Commit c4c31dc · Parent: cbfd5f9

delete due to wrong path

Files changed (1)
  1. pages.py +0 -626
pages.py DELETED
@@ -1,626 +0,0 @@
- import streamlit as st
- from app.draw_diagram import *
- from app.content import *
- from app.summarization import *
- from app.show_examples import *
-
- def dataset_contents(dataset, metrics):
-
-     custom_css = """
-     <style>
-     .my-dataset-info {
-         # background-color: #F9EBEA;
-         # padding: 10px;
-         color: #050505;
-         font-style: normal;
-         font-size: 8px;
-         height: auto;
-     }
-     </style>
-     """
-     st.markdown(custom_css, unsafe_allow_html=True)
-     st.markdown(f"""<div class="my-dataset-info">
-     <p><b>About this dataset</b>: {dataset}</p>
-     </div>""", unsafe_allow_html=True)
-     st.markdown(f"""<div class="my-dataset-info">
-     <p><b>About this metric</b>: {metrics}</p>
-     </div>""", unsafe_allow_html=True)
-
-
- def dashboard():
-
-     with st.container():
-         st.title("Leaderboard for AudioBench")
-
-         st.markdown("""
- [gh1]: https://github.com/AudioLLMs/AudioBench
- [gh2]: https://github.com/AudioLLMs/AudioBench
- **Toolkit:** [![GitHub Repo stars](https://img.shields.io/github/stars/AudioLLMs/AudioBench?style=social)][gh1] |
- [**Paper @ NAACL 2025**](https://arxiv.org/abs/2406.16020) |
- **Resource for AudioLLMs:** [![GitHub Repo stars](https://img.shields.io/github/stars/AudioLLMs/Awesome-Audio-LLM?style=social)][gh2]
- """)
-
-
-         st.markdown("""
- #### Recent updates
- - **Jan. 2025**: AudioBench has been officially accepted to NAACL 2025!
- - **Jan. 2025**: Updated the layout.
- - **Dec. 2024**: Added the MuChoMusic dataset for Music Understanding - MCQ Questions. Paper: https://arxiv.org/abs/2408.01337.
- - **Dec. 2024**: Singlish ASR task added! The datasets are available on [HF](https://huggingface.co/datasets/MERaLiON/MNSC).
- - **Dec. 2024**: Updated the layout and added support for comparing models of similar size: 1) reorganized the layout for a better user experience; 2) added a performance summary for each task.
- - **Aug. 2024**: Initial leaderboard is now online.
- """)
-
-         st.divider()
-
-         st.markdown("""
- #### Evaluating Audio-based Large Language Models
-
- - AudioBench is a comprehensive evaluation benchmark designed for general instruction-following audio large language models.
- - AudioBench is an evaluation benchmark that we continually improve and maintain.
-
- Below are the initial 26 datasets included in AudioBench. Coverage has since been extended to over 40 datasets and will keep growing.
- """
-         )
-
-
-     with st.container():
-
-         st.markdown('''
-         ''')
-
-         st.markdown("###### :dart: Our benchmark includes:")
-         cols = st.columns(8)
-         cols[0].metric(label="Tasks", value=">8")
-         cols[1].metric(label="Datasets", value=">40")
-         cols[2].metric(label="Evaluated Models", value=">5")
-
-     st.divider()
-     with st.container():
-         left_co, right_co = st.columns([1, 0.1])
-
-         with left_co:
-             st.markdown("""
- ##### Citations :round_pushpin:
- ```
- @article{wang2024audiobench,
-   title={AudioBench: A Universal Benchmark for Audio Large Language Models},
-   author={Wang, Bin and Zou, Xunlong and Lin, Geyu and Sun, Shuo and Liu, Zhuohan and Zhang, Wenyu and Liu, Zhengyuan and Aw, AiTi and Chen, Nancy F},
-   journal={NAACL},
-   year={2025}
- }
- ```
- ```
- @article{zhang2024mowe,
-   title={MoWE-Audio: Multitask AudioLLMs with Mixture of Weak Encoders},
-   author={Zhang, Wenyu and Sun, Shuo and Wang, Bin and Zou, Xunlong and Liu, Zhuohan and He, Yingxu and Lin, Geyu and Chen, Nancy F and Aw, Ai Ti},
-   journal={ICASSP},
-   year={2025}
- }
- ```
- ```
- @article{wang2025advancing,
-   title={Advancing Singlish Understanding: Bridging the Gap with Datasets and Multimodal Models},
-   author={Wang, Bin and Zou, Xunlong and Sun, Shuo and Zhang, Wenyu and He, Yingxu and Liu, Zhuohan and Wei, Chengwei and Chen, Nancy F and Aw, AiTi},
-   journal={arXiv preprint arXiv:2501.01034},
-   year={2025}
- }
- ```
- ```
- @article{he2024meralion,
-   title={MERaLiON-AudioLLM: Technical Report},
-   author={He, Yingxu and Liu, Zhuohan and Sun, Shuo and Wang, Bin and Zhang, Wenyu and Zou, Xunlong and Chen, Nancy F and Aw, Ai Ti},
-   journal={arXiv preprint arXiv:2412.09818},
-   year={2024}
- }
- ```
-
- """)
-
-
-
-
-
-
-
- def asr_english():
-     st.title("Task: Automatic Speech Recognition - English")
-
-     sum = ['Overall']
-     dataset_lists = [
-         'LibriSpeech-Clean',
-         'LibriSpeech-Other',
-         'CommonVoice-15-EN',
-         'Peoples-Speech',
-         'GigaSpeech-1',
-         'Earnings-21',
-         'Earnings-22',
-         'TED-LIUM-3',
-         'TED-LIUM-3-LongForm',
-     ]
-
-     filters_levelone = sum + dataset_lists
-
-     left, center, _, middle, right = st.columns([0.4, 0.2, 0.2, 0.2, 0.2])
-
-     with left:
-         filter_1 = st.selectbox('Dataset', filters_levelone)
-
-     if filter_1:
-         if filter_1 in sum:
-             sum_table_mulit_metrix('asr_english', ['wer'])
-         else:
-             dataset_contents(dataset_diaplay_information[filter_1], metrics_info['wer'])
-             draw('su', 'asr_english', filter_1, 'wer', cus_sort=True)
-
-
-
-
-
- def asr_singlish():
-     st.title("Task: Automatic Speech Recognition - Singlish")
-
-     sum = ['Overall']
-     dataset_lists = [
-         'MNSC-PART1-ASR',
-         'MNSC-PART2-ASR',
-         'MNSC-PART3-ASR',
-         'MNSC-PART4-ASR',
-         'MNSC-PART5-ASR',
-         'MNSC-PART6-ASR',
-     ]
-
-     filters_levelone = sum + dataset_lists
-
-     left, center, _, middle, right = st.columns([0.4, 0.2, 0.2, 0.2, 0.2])
-
-     with left:
-         filter_1 = st.selectbox('Dataset', filters_levelone)
-
-     if filter_1:
-         if filter_1 in sum:
-             sum_table_mulit_metrix('asr_singlish', ['wer'])
-         else:
-             dataset_contents(dataset_diaplay_information[filter_1], metrics_info['wer'])
-             draw('su', 'asr_singlish', filter_1, 'wer')
-
-
-
-
- def asr_mandarin():
-     st.title("Task: Automatic Speech Recognition - Mandarin")
-
-     sum = ['Overall']
-     dataset_lists = [
-         'AISHELL-ASR-ZH',
-     ]
-
-     filters_levelone = sum + dataset_lists
-
-     left, center, _, middle, right = st.columns([0.4, 0.2, 0.2, 0.2, 0.2])
-
-     with left:
-         filter_1 = st.selectbox('Dataset', filters_levelone)
-
-     if filter_1:
-         if filter_1 in sum:
-             sum_table_mulit_metrix('asr_mandarin', ['wer'])
-         else:
-             dataset_contents(dataset_diaplay_information[filter_1], metrics_info['wer'])
-             draw('su', 'asr_mandarin', filter_1, 'wer')
-
-
-
-
- def speech_translation():
-     st.title("Task: Speech Translation")
-
-     sum = ['Overall']
-     dataset_lists = [
-         'CoVoST2-EN-ID',
-         'CoVoST2-EN-ZH',
-         'CoVoST2-EN-TA',
-         'CoVoST2-ID-EN',
-         'CoVoST2-ZH-EN',
-         'CoVoST2-TA-EN']
-
-     filters_levelone = sum + dataset_lists
-
-     left, center, _, middle, right = st.columns([0.4, 0.2, 0.2, 0.2, 0.2])
-
-     with left:
-         filter_1 = st.selectbox('Dataset', filters_levelone)
-
-     if filter_1:
-         if filter_1 in sum:
-             sum_table_mulit_metrix('st', ['bleu'])
-         else:
-             dataset_contents(dataset_diaplay_information[filter_1], metrics_info['bleu'])
-             draw('su', 'ST', filter_1, 'bleu')
-
-
-
-
- def speech_question_answering_english():
-     st.title("Task: Spoken Question Answering - English")
-
-     sum = ['Overall']
-
-     dataset_lists = [
-         'CN-College-Listen-MCQ',
-         'DREAM-TTS-MCQ',
-         'SLUE-P2-SQA5',
-         'Public-SG-Speech-QA',
-         'Spoken-SQuAD',
-     ]
-
-     filters_levelone = sum + dataset_lists
-
-     left, center, _, middle, right = st.columns([0.4, 0.2, 0.2, 0.2, 0.2])
-
-     with left:
-         filter_1 = st.selectbox('Dataset', filters_levelone)
-
-     if filter_1:
-         if filter_1 in sum:
-             sum_table_mulit_metrix('sqa_english', ['llama3_70b_judge'])
-
-         # elif filter_1 in dataset_lists:
-         #     dataset_contents(sqa_datasets[filter_1], metrics['llama3_70b_judge'])
-         #     draw('su', 'SQA', filter_1, 'llama3_70b_judge')
-
-         else:
-             dataset_contents(dataset_diaplay_information[filter_1], metrics_info['llama3_70b_judge'])
-             draw('su', 'sqa_english', filter_1, 'llama3_70b_judge')
-
-
-
-
- def speech_question_answering_singlish():
-     st.title("Task: Spoken Question Answering - Singlish")
-
-     sum = ['Overall']
-
-     dataset_lists = [
-         'MNSC-PART3-SQA',
-         'MNSC-PART4-SQA',
-         'MNSC-PART5-SQA',
-         'MNSC-PART6-SQA',
-     ]
-
-
-     filters_levelone = sum + dataset_lists
-
-     left, center, _, middle, right = st.columns([0.4, 0.2, 0.2, 0.2, 0.2])
-
-     with left:
-         filter_1 = st.selectbox('Dataset', filters_levelone)
-
-     if filter_1:
-         if filter_1 in sum:
-             sum_table_mulit_metrix('sqa_singlish', ['llama3_70b_judge'])
-
-         else:
-             dataset_contents(dataset_diaplay_information[filter_1], metrics_info['llama3_70b_judge'])
-             draw('su', 'sqa_singlish', filter_1, 'llama3_70b_judge')
-
-
- def spoken_dialogue_summarization_singlish():
-     st.title("Task: Spoken Dialogue Summarization - Singlish")
-
-     sum = ['Overall']
-
-     dataset_lists = [
-         'MNSC-PART3-SDS',
-         'MNSC-PART4-SDS',
-         'MNSC-PART5-SDS',
-         'MNSC-PART6-SDS',
-     ]
-
-
-     filters_levelone = sum + dataset_lists
-
-     left, center, _, middle, right = st.columns([0.4, 0.2, 0.2, 0.2, 0.2])
-
-     with left:
-         filter_1 = st.selectbox('Dataset', filters_levelone)
-
-     if filter_1:
-         if filter_1 in sum:
-             sum_table_mulit_metrix('sds_singlish', ['llama3_70b_judge'])
-
-         else:
-             dataset_contents(dataset_diaplay_information[filter_1], metrics_info['llama3_70b_judge'])
-             draw('su', 'sds_singlish', filter_1, 'llama3_70b_judge')
-
-
-
-
- def speech_instruction():
-     st.title("Task: Speech Instruction")
-
-     sum = ['Overall']
-
-     dataset_lists = ['OpenHermes-Audio',
-                      'ALPACA-Audio',
-                      ]
-
-     filters_levelone = sum + dataset_lists
-
-     left, center, _, middle, right = st.columns([0.4, 0.2, 0.2, 0.2, 0.2])
-
-     with left:
-         filter_1 = st.selectbox('Dataset', filters_levelone)
-
-     if filter_1:
-         if filter_1 in sum:
-             sum_table_mulit_metrix('speech_instruction', ['llama3_70b_judge'])
-         else:
-             dataset_contents(dataset_diaplay_information[filter_1], metrics_info['llama3_70b_judge'])
-             draw('su', 'speech_instruction', filter_1, 'llama3_70b_judge')
-
-
-
-
- def audio_captioning():
-     st.title("Task: Audio Captioning")
-
-     filters_levelone = ['WavCaps',
-                         'AudioCaps',
-                         ]
-     filters_leveltwo = ['Llama3-70b-judge', 'Meteor']
-
-     left, center, _, middle, right = st.columns([0.4, 0.2, 0.2, 0.2, 0.2])
-
-     with left:
-         filter_1 = st.selectbox('Dataset', filters_levelone)
-     with middle:
-         metric = st.selectbox('Metric', filters_leveltwo)
-
-     if filter_1 or metric:
-         dataset_contents(dataset_diaplay_information[filter_1], metrics_info[metric.lower().replace('-', '_')])
-         draw('asu', 'audio_captioning', filter_1, metric.lower().replace('-', '_'))
-
-
-
-
- def audio_scene_question_answering():
-     st.title("Task: Audio Scene Question Answering")
-
-     sum = ['Overall']
-
-     dataset_lists = ['Clotho-AQA',
-                      'WavCaps-QA',
-                      'AudioCaps-QA']
-
-     filters_levelone = sum + dataset_lists
-
-     left, center, _, middle, right = st.columns([0.4, 0.2, 0.2, 0.2, 0.2])
-
-     with left:
-         filter_1 = st.selectbox('Dataset', filters_levelone)
-
-     if filter_1:
-         if filter_1 in sum:
-             sum_table_mulit_metrix('audio_scene_question_answering', ['llama3_70b_judge'])
-         else:
-             dataset_contents(dataset_diaplay_information[filter_1], metrics_info['llama3_70b_judge'])
-             draw('asu', 'audio_scene_question_answering', filter_1, 'llama3_70b_judge')
-
-
-
-
- def emotion_recognition():
-     st.title("Task: Emotion Recognition")
-
-     sum = ['Overall']
-
-     dataset_lists = [
-         'IEMOCAP-Emotion',
-         'MELD-Sentiment',
-         'MELD-Emotion',
-     ]
-
-     filters_levelone = sum + dataset_lists
-
-     left, center, _, middle, right = st.columns([0.4, 0.2, 0.2, 0.2, 0.2])
-
-     with left:
-         filter_1 = st.selectbox('Dataset', filters_levelone)
-
-     if filter_1:
-         if filter_1 in sum:
-             sum_table_mulit_metrix('emotion_recognition', ['llama3_70b_judge'])
-         else:
-             dataset_contents(dataset_diaplay_information[filter_1], metrics_info['llama3_70b_judge'])
-             draw('vu', 'emotion_recognition', filter_1, 'llama3_70b_judge')
-
-
-
-
- def accent_recognition():
-     st.title("Task: Accent Recognition")
-
-     sum = ['Overall']
-     dataset_lists = [
-         'VoxCeleb-Accent',
-         'MNSC-AR-Sentence',
-         'MNSC-AR-Dialogue',
-     ]
-
-
-     filters_levelone = sum + dataset_lists
-
-     left, center, _, middle, right = st.columns([0.4, 0.2, 0.2, 0.2, 0.2])
-
-     with left:
-         filter_1 = st.selectbox('Dataset', filters_levelone)
-
-
-     if filter_1:
-         if filter_1 in sum:
-             sum_table_mulit_metrix('accent_recognition', ['llama3_70b_judge'])
-         else:
-             dataset_contents(dataset_diaplay_information[filter_1], metrics_info['llama3_70b_judge'])
-             draw('vu', 'accent_recognition', filter_1, 'llama3_70b_judge')
-
-
-
-
- def gender_recognition():
-     st.title("Task: Gender Recognition")
-
-     sum = ['Overall']
-
-     dataset_lists = [
-         'VoxCeleb-Gender',
-         'IEMOCAP-Gender'
-     ]
-
-     filters_levelone = sum + dataset_lists
-
-     left, center, _, middle, right = st.columns([0.4, 0.2, 0.2, 0.2, 0.2])
-
-     with left:
-         filter_1 = st.selectbox('Dataset', filters_levelone)
-
-     if filter_1:
-         if filter_1 in sum:
-             sum_table_mulit_metrix('gender_recognition', ['llama3_70b_judge'])
-         else:
-             dataset_contents(dataset_diaplay_information[filter_1], metrics_info['llama3_70b_judge'])
-             draw('vu', 'gender_recognition', filter_1, 'llama3_70b_judge')
-
-
-
-
- def music_understanding():
-     st.title("Task: Music Understanding - MCQ Questions")
-
-     sum = ['Overall']
-
-     dataset_lists = ['MuChoMusic',
-                      ]
-
-     filters_levelone = sum + dataset_lists
-
-     left, center, _, middle, right = st.columns([0.4, 0.2, 0.2, 0.2, 0.2])
-
-     with left:
-         filter_1 = st.selectbox('Dataset', filters_levelone)
-
-     if filter_1:
-         if filter_1 in sum:
-             sum_table_mulit_metrix('music_understanding', ['llama3_70b_judge'])
-         else:
-             dataset_contents(dataset_diaplay_information[filter_1], metrics_info['llama3_70b_judge'])
-             draw('vu', 'music_understanding', filter_1, 'llama3_70b_judge')
-
-
-
-
-
-
-
-
-
-
- def under_development():
-     st.title("Task: Under Development")
-
-
-     dataset_lists = [
-         'CNA',
-         'IDPC',
-         'Parliament',
-         'UKUS-News',
-         'Mediacorp',
-         'IDPC-Short',
-         'Parliament-Short',
-         'UKUS-News-Short',
-         'Mediacorp-Short',
-
-         'YouTube ASR: English Singapore Content',
-         'YouTube ASR: English with Strong Emotion',
-         'YouTube ASR: Malay English Prompt',
-         'YouTube ASR: Malay with Malay Prompt',
-
-         'SEAME-Dev-Mandarin',
-         'SEAME-Dev-Singlish',
-
-         'YouTube SQA: English with Singapore Content',
-         'YouTube SDS: English with Singapore Content',
-         'YouTube PQA: English with Singapore Content',
-
-     ]
-
-     filters_levelone = dataset_lists
-
-     left, center, _, middle, right = st.columns([0.4, 0.2, 0.2, 0.2, 0.2])
-
-     with left:
-         filter_1 = st.selectbox('Dataset', filters_levelone)
-
-     dataset_contents(dataset_diaplay_information[filter_1], 'under_development')
-
-     # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
-
-     '''
-     Show Dataset Examples
-     '''
-
-     # Initialize a session state variable for toggling the dataset-example visibility
-     if "show_dataset_examples" not in st.session_state:
-         st.session_state.show_dataset_examples = False
-
-     # Create a button to toggle visibility
-     if st.button("Show Dataset Examples"):
-         st.session_state.show_dataset_examples = not st.session_state.show_dataset_examples
-
-     if st.session_state.show_dataset_examples:
-
-         # st.markdown('To be implemented')
-
-         # # if dataset_name in ['Earnings21-Test', 'Earnings22-Test', 'Tedlium3-Test', 'Tedlium3-Long-form-Test']:
-         if filter_1 in []:
-             pass
-         else:
-             try:
-                 show_dataset_examples(filter_1)
-             except Exception:
-                 st.markdown('To be implemented')
-     # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
-
-     if filter_1 in [
-         'CNA',
-         'IDPC',
-         'Parliament',
-         'UKUS-News',
-         'Mediacorp',
-         'IDPC-Short',
-         'Parliament-Short',
-         'UKUS-News-Short',
-         'Mediacorp-Short',
-
-         'YouTube ASR: English Singapore Content',
-         'YouTube ASR: English with Strong Emotion',
-         'YouTube ASR: Malay English Prompt',
-         'YouTube ASR: Malay with Malay Prompt',
-
-         'SEAME-Dev-Mandarin',
-         'SEAME-Dev-Singlish',
-     ]:
-
-         draw('vu', 'under_development_wer', filter_1, 'wer')
-
-     elif filter_1 in [
-         'YouTube SQA: English with Singapore Content',
-         'YouTube SDS: English with Singapore Content',
-         'YouTube PQA: English with Singapore Content',
-     ]:
-         draw('vu', 'under_development_llama3_70b_judge', filter_1, 'llama3_70b_judge')
-
-
-
-
-
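
Reviewer's note: every task page in the deleted file repeats the same pattern (build a dataset list, prepend 'Overall', render a selectbox, then branch between the cross-dataset summary table and a per-dataset chart). Below is a minimal sketch of how that pattern could be factored into a single helper if the file is restored at the correct path. `render_task_page` and the inlined `dataset_contents` stand-in are hypothetical names, and the explicit imports assume the `app.*` modules export the names that pages.py pulled in via star-imports.

```python
import streamlit as st

# Assumed to exist in the app.* modules, exactly as used throughout the deleted file.
from app.content import dataset_diaplay_information, metrics_info
from app.draw_diagram import draw
from app.summarization import sum_table_mulit_metrix


def dataset_contents(dataset, metric):
    # Stand-in for the CSS-styled helper defined at the top of pages.py.
    st.markdown(f"**About this dataset**: {dataset}")
    st.markdown(f"**About this metric**: {metric}")


def render_task_page(title, task_key, datasets, metric, chart_group='su', **draw_kwargs):
    """Render one task page: an 'Overall' summary plus per-dataset charts."""
    st.title(f"Task: {title}")

    # 'Overall' shows the cross-dataset summary table; any other selection
    # shows the dataset/metric blurb and its chart. (The original wrapped
    # the selectbox in st.columns; that layout detail is omitted here.)
    choice = st.selectbox('Dataset', ['Overall'] + datasets)
    if choice == 'Overall':
        sum_table_mulit_metrix(task_key, [metric])
    else:
        dataset_contents(dataset_diaplay_information[choice], metrics_info[metric])
        draw(chart_group, task_key, choice, metric, **draw_kwargs)


# The English ASR page would then collapse to one call:
render_task_page(
    'Automatic Speech Recognition - English',
    'asr_english',
    ['LibriSpeech-Clean', 'LibriSpeech-Other', 'CommonVoice-15-EN'],
    'wer',
    cus_sort=True,
)
```

Factored this way, adding a new task is a single call, and the task keys and metric names live in one place instead of a dozen near-identical functions.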