He Yingxu committed on
Commit
b46797f
·
1 Parent(s): 7ef552a

add meralion2

Browse files
app.py CHANGED
@@ -19,6 +19,8 @@ pages = {
19
  'ASR-English' : asr_english,
20
  'ASR-Mandarin' : asr_mandarin,
21
  'ASR-Singlish' : asr_singlish,
 
 
22
  'Speech Translation' : speech_translation,
23
  'SQA-English' : speech_question_answering_english,
24
  'SQA-Singlish' : speech_question_answering_singlish,
@@ -47,6 +49,8 @@ menu_items = [
47
  sac.MenuItem(label='ASR-English', icon='mic'),
48
  sac.MenuItem(label='ASR-Mandarin', icon='mic'),
49
  sac.MenuItem(label='ASR-Singlish', icon='mic'),
 
 
50
  ]
51
  ),
52
 
 
19
  'ASR-English' : asr_english,
20
  'ASR-Mandarin' : asr_mandarin,
21
  'ASR-Singlish' : asr_singlish,
22
+ 'ASR-SEA' : asr_sea,
23
+ 'ASR-Private' : asr_private,
24
  'Speech Translation' : speech_translation,
25
  'SQA-English' : speech_question_answering_english,
26
  'SQA-Singlish' : speech_question_answering_singlish,
 
49
  sac.MenuItem(label='ASR-English', icon='mic'),
50
  sac.MenuItem(label='ASR-Mandarin', icon='mic'),
51
  sac.MenuItem(label='ASR-Singlish', icon='mic'),
52
+ sac.MenuItem(label='ASR-SEA', icon='mic'),
53
+ sac.MenuItem(label='ASR-Private', icon='mic'),
54
  ]
55
  ),
56
 
app/content.py CHANGED
@@ -1,5 +1,157 @@
 
 
 
 
 
 
 
 
 
 
 
1
 
2
- displayname2datasetname = {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3
  'LibriSpeech-Clean' : 'librispeech_test_clean',
4
  'LibriSpeech-Other' : 'librispeech_test_other',
5
  'CommonVoice-15-EN' : 'common_voice_15_en_test',
@@ -9,65 +161,102 @@ displayname2datasetname = {
9
  'Earnings-22' : 'earnings22_test',
10
  'TED-LIUM-3' : 'tedlium3_test',
11
  'TED-LIUM-3-LongForm' : 'tedlium3_long_form_test',
 
 
 
 
 
 
 
 
12
  'AISHELL-ASR-ZH' : 'aishell_asr_zh_test',
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
13
  'CoVoST2-EN-ID' : 'covost2_en_id_test',
14
  'CoVoST2-EN-ZH' : 'covost2_en_zh_test',
15
  'CoVoST2-EN-TA' : 'covost2_en_ta_test',
16
  'CoVoST2-ID-EN' : 'covost2_id_en_test',
17
  'CoVoST2-ZH-EN' : 'covost2_zh_en_test',
18
  'CoVoST2-TA-EN' : 'covost2_ta_en_test',
 
19
  'CN-College-Listen-MCQ': 'cn_college_listen_mcq_test',
20
  'DREAM-TTS-MCQ' : 'dream_tts_mcq_test',
21
  'SLUE-P2-SQA5' : 'slue_p2_sqa5_test',
22
  'Public-SG-Speech-QA' : 'public_sg_speech_qa_test',
23
  'Spoken-SQuAD' : 'spoken_squad_test',
 
 
 
 
 
 
 
 
 
 
 
 
24
  'OpenHermes-Audio' : 'openhermes_audio_test',
25
  'ALPACA-Audio' : 'alpaca_audio_test',
 
26
  'WavCaps' : 'wavcaps_test',
27
  'AudioCaps' : 'audiocaps_test',
 
28
  'Clotho-AQA' : 'clotho_aqa_test',
29
  'WavCaps-QA' : 'wavcaps_qa_test',
30
  'AudioCaps-QA' : 'audiocaps_qa_test',
 
 
 
 
 
31
  'VoxCeleb-Accent' : 'voxceleb_accent_test',
32
  'MNSC-AR-Sentence' : 'imda_ar_sentence',
33
  'MNSC-AR-Dialogue' : 'imda_ar_dialogue',
 
34
  'VoxCeleb-Gender' : 'voxceleb_gender_test',
35
  'IEMOCAP-Gender' : 'iemocap_gender_test',
36
- 'IEMOCAP-Emotion' : 'iemocap_emotion_test',
37
- 'MELD-Sentiment' : 'meld_sentiment_test',
38
- 'MELD-Emotion' : 'meld_emotion_test',
39
  'MuChoMusic' : 'muchomusic_test',
40
- 'MNSC-PART1-ASR' : 'imda_part1_asr_test',
41
- 'MNSC-PART2-ASR' : 'imda_part2_asr_test',
42
- 'MNSC-PART3-ASR' : 'imda_part3_30s_asr_test',
43
- 'MNSC-PART4-ASR' : 'imda_part4_30s_asr_test',
44
- 'MNSC-PART5-ASR' : 'imda_part5_30s_asr_test',
45
- 'MNSC-PART6-ASR' : 'imda_part6_30s_asr_test',
46
- 'MNSC-PART3-SQA' : 'imda_part3_30s_sqa_human_test',
47
- 'MNSC-PART4-SQA' : 'imda_part4_30s_sqa_human_test',
48
- 'MNSC-PART5-SQA' : 'imda_part5_30s_sqa_human_test',
49
- 'MNSC-PART6-SQA' : 'imda_part6_30s_sqa_human_test',
50
- 'MNSC-PART3-SDS' : 'imda_part3_30s_ds_human_test',
51
- 'MNSC-PART4-SDS' : 'imda_part4_30s_ds_human_test',
52
- 'MNSC-PART5-SDS' : 'imda_part5_30s_ds_human_test',
53
- 'MNSC-PART6-SDS' : 'imda_part6_30s_ds_human_test',
54
 
55
- 'CNA' : 'cna_test',
56
- 'IDPC' : 'idpc_test',
57
- 'Parliament' : 'parliament_test',
58
- 'UKUS-News' : 'ukusnews_test',
59
- 'Mediacorp' : 'mediacorp_test',
60
- 'IDPC-Short' : 'idpc_short_test',
61
- 'Parliament-Short': 'parliament_short_test',
62
- 'UKUS-News-Short' : 'ukusnews_short_test',
63
- 'Mediacorp-Short' : 'mediacorp_short_test',
64
-
65
- 'YouTube ASR: English Singapore Content': 'ytb_asr_batch1',
66
- 'YouTube ASR: English with Strong Emotion': 'ytb_asr_batch2',
67
- 'YouTube ASR: Malay with English Prompt': 'ytb_asr_batch3_malay',
68
- 'YouTube ASR: Malay with Malay Prompt': 'ytb_asr_batch3_ms_ms_prompt',
69
- 'YouTube ASR: Chinese with English Prompt': 'ytb_asr_batch3_chinese',
70
- 'YouTube ASR: Chinese with Chinese Prompt': 'ytb_asr_batch3_zh_zh_prompt',
71
 
72
  'YouTube SQA: Malay': 'ytb_sqa_batch3_malay',
73
  'YouTube SQA: Chinese': 'ytb_sqa_batch3_chinese',
@@ -76,15 +265,13 @@ displayname2datasetname = {
76
  'YouTube SDS: Malay': 'ytb_sds_batch3_malay',
77
  'YouTube SDS: Chinese': 'ytb_sds_batch3_chinese',
78
  'YouTube SDS: Tamil': 'ytb_sds_batch3_tamil',
79
-
80
- 'SEAME-Dev-Mandarin' : 'seame_dev_man',
81
- 'SEAME-Dev-Singlish' : 'seame_dev_sge',
82
 
83
- 'YouTube SQA: English with Singapore Content': 'ytb_sqa_batch1',
84
- 'YouTube SDS: English with Singapore Content': 'ytb_sds_batch1',
85
- 'YouTube PQA: English with Singapore Content': 'ytb_pqa_batch1',
86
 
87
- }
 
 
 
88
 
89
  datasetname2diaplayname = {datasetname: displayname for displayname, datasetname in displayname2datasetname.items()}
90
 
@@ -152,6 +339,19 @@ dataset_diaplay_information = {
152
  'Parliament-Short': 'Under Development',
153
  'UKUS-News-Short' : 'Under Development',
154
  'Mediacorp-Short' : 'Under Development',
 
 
 
 
 
 
 
 
 
 
 
 
 
155
 
156
  'YouTube ASR: English Singapore Content' : 'YouTube Evaluation Dataset for ASR Task: <br> This dataset contains English and Singlish audio clips, featuring Singapore-related content. <br> It includes approximately 2.5 hours of audio, with individual clips ranging from 2 seconds to 30 seconds in length.',
157
 
 
1
+ asr_english_datasets = [
2
+ 'LibriSpeech-Clean',
3
+ 'LibriSpeech-Other',
4
+ 'CommonVoice-15-EN',
5
+ 'Peoples-Speech',
6
+ 'GigaSpeech-1',
7
+ 'Earnings-21',
8
+ 'Earnings-22',
9
+ 'TED-LIUM-3',
10
+ 'TED-LIUM-3-LongForm',
11
+ ]
12
 
13
+
14
+ asr_singlish_datasets = [
15
+ 'MNSC-PART1-ASR',
16
+ 'MNSC-PART2-ASR',
17
+ 'MNSC-PART3-ASR',
18
+ 'MNSC-PART4-ASR',
19
+ 'MNSC-PART5-ASR',
20
+ 'MNSC-PART6-ASR',
21
+ ]
22
+
23
+
24
+ asr_mandarin_datasets = [
25
+ 'AISHELL-ASR-ZH',
26
+ 'CommonVoice-ZH'
27
+ ]
28
+
29
+
30
+ asr_sea_datasets = [
31
+ 'CommonVoice-17-Indonesian',
32
+ 'CommonVoice-17-Tamil',
33
+ # 'CommonVoice-17-Thai',
34
+ 'CommonVoice-17-Vietnamese',
35
+ 'GigaSpeech-2-Indonesain',
36
+ 'GigaSpeech-2-Thai',
37
+ 'GigaSpeech-2-Vietnamese',
38
+ 'Fleurs-Tamil',
39
+ 'Lotus-Thai'
40
+ ]
41
+
42
+
43
+ asr_private_datasets = [
44
+ 'CNA',
45
+ 'IDPC',
46
+ 'Parliament',
47
+ 'UKUS-News',
48
+ 'Mediacorp',
49
+ 'IDPC-Short',
50
+ 'Parliament-Short',
51
+ 'UKUS-News-Short',
52
+ 'Mediacorp-Short',
53
+ 'YouTube ASR: English Singapore Content',
54
+ 'YouTube ASR: English with Strong Emotion',
55
+ 'YouTube ASR: Malay with English Prompt',
56
+ 'YouTube ASR: Chinese with English Prompt',
57
+ 'YouTube ASR: Tamil with English Prompt'
58
+ ]
59
+
60
+
61
+ speech_translation_datasets = [
62
+ 'CoVoST2-EN-ID',
63
+ 'CoVoST2-EN-ZH',
64
+ 'CoVoST2-EN-TA',
65
+ 'CoVoST2-ID-EN',
66
+ 'CoVoST2-ZH-EN',
67
+ 'CoVoST2-TA-EN'
68
+ ]
69
+
70
+
71
+ speech_qa_english_datasets = [
72
+ 'CN-College-Listen-MCQ',
73
+ 'DREAM-TTS-MCQ',
74
+ 'SLUE-P2-SQA5',
75
+ 'Public-SG-Speech-QA',
76
+ 'Spoken-SQuAD',
77
+ 'MMAU-mini'
78
+ ]
79
+
80
+
81
+ speech_qa_singlish_datasets = [
82
+ 'MNSC-PART3-SQA',
83
+ 'MNSC-PART4-SQA',
84
+ 'MNSC-PART5-SQA',
85
+ 'MNSC-PART6-SQA',
86
+ ]
87
+
88
+
89
+ sds_datasets = [
90
+ 'MNSC-PART3-SDS',
91
+ 'MNSC-PART4-SDS',
92
+ 'MNSC-PART5-SDS',
93
+ 'MNSC-PART6-SDS',
94
+ ]
95
+
96
+
97
+ si_datasets = [
98
+ 'OpenHermes-Audio',
99
+ 'ALPACA-Audio',
100
+ ]
101
+
102
+
103
+ ac_datasets = [
104
+ 'WavCaps',
105
+ 'AudioCaps',
106
+ ]
107
+
108
+
109
+ asqa_datasets = [
110
+ 'Clotho-AQA',
111
+ 'WavCaps-QA',
112
+ 'AudioCaps-QA'
113
+ ]
114
+
115
+
116
+ er_datasets = [
117
+ 'IEMOCAP-Emotion',
118
+ 'MELD-Sentiment',
119
+ 'MELD-Emotion',
120
+ ]
121
+
122
+
123
+ ar_datasets = [
124
+ 'VoxCeleb-Accent',
125
+ 'MNSC-AR-Sentence',
126
+ 'MNSC-AR-Dialogue',
127
+ ]
128
+
129
+
130
+ gr_datasets = [
131
+ 'VoxCeleb-Gender',
132
+ 'IEMOCAP-Gender'
133
+ ]
134
+
135
+
136
+ music_datasets = ['MuChoMusic']
137
+
138
+
139
+ wer_development_datasets = [
140
+ 'YouTube ASR: Malay with Malay Prompt',
141
+ 'YouTube ASR: Chinese with Chinese Prompt',
142
+ 'SEAME-Dev-Mandarin',
143
+ 'SEAME-Dev-Singlish',
144
+ ]
145
+
146
+
147
+ non_wer_development_datasets = [
148
+ 'YouTube SQA: English with Singapore Content',
149
+ 'YouTube SDS: English with Singapore Content',
150
+ 'YouTube PQA: English with Singapore Content',
151
+ ]
152
+
153
+
154
+ wer_displayname2datasetname = {
155
  'LibriSpeech-Clean' : 'librispeech_test_clean',
156
  'LibriSpeech-Other' : 'librispeech_test_other',
157
  'CommonVoice-15-EN' : 'common_voice_15_en_test',
 
161
  'Earnings-22' : 'earnings22_test',
162
  'TED-LIUM-3' : 'tedlium3_test',
163
  'TED-LIUM-3-LongForm' : 'tedlium3_long_form_test',
164
+
165
+ 'MNSC-PART1-ASR' : 'imda_part1_asr_test',
166
+ 'MNSC-PART2-ASR' : 'imda_part2_asr_test',
167
+ 'MNSC-PART3-ASR' : 'imda_part3_30s_asr_test',
168
+ 'MNSC-PART4-ASR' : 'imda_part4_30s_asr_test',
169
+ 'MNSC-PART5-ASR' : 'imda_part5_30s_asr_test',
170
+ 'MNSC-PART6-ASR' : 'imda_part6_30s_asr_test',
171
+
172
  'AISHELL-ASR-ZH' : 'aishell_asr_zh_test',
173
+ 'CommonVoice-ZH' : 'commonvoice_zh_asr',
174
+
175
+ 'CommonVoice-17-Indonesian' : 'commonvoice_17_id_asr',
176
+ 'CommonVoice-17-Tamil' : 'commonvoice_17_ta_asr',
177
+ 'CommonVoice-17-Thai' : 'commonvoice_17_th_asr',
178
+ 'CommonVoice-17-Vietnamese' : 'commonvoice_17_vi_asr',
179
+ 'GigaSpeech-2-Indonesain' : 'gigaspeech2_id_test',
180
+ 'GigaSpeech-2-Thai' : 'gigaspeech2_th_test',
181
+ 'GigaSpeech-2-Vietnamese' : 'gigaspeech2_vi_test',
182
+ 'Fleurs-Tamil' : 'fleurs_tamil_ta_30_asr',
183
+ 'Lotus-Thai' : 'lotus_thai_th_30_asr',
184
+
185
+ 'CNA' : 'cna_test',
186
+ 'IDPC' : 'idpc_test',
187
+ 'Parliament' : 'parliament_test',
188
+ 'UKUS-News' : 'ukusnews_test',
189
+ 'Mediacorp' : 'mediacorp_test',
190
+ 'IDPC-Short' : 'idpc_short_test',
191
+ 'Parliament-Short': 'parliament_short_test',
192
+ 'UKUS-News-Short' : 'ukusnews_short_test',
193
+ 'Mediacorp-Short' : 'mediacorp_short_test',
194
+
195
+ 'YouTube ASR: English Singapore Content': 'ytb_asr_batch1',
196
+ 'YouTube ASR: English with Strong Emotion': 'ytb_asr_batch2',
197
+ 'YouTube ASR: Malay with English Prompt': 'ytb_asr_batch3_malay',
198
+ 'YouTube ASR: Chinese with English Prompt': 'ytb_asr_batch3_chinese',
199
+ 'YouTube ASR: Tamil with English Prompt': 'ytb_asr_batch3_tamil',
200
+
201
+ 'YouTube ASR: Malay with Malay Prompt': 'ytb_asr_batch3_ms_ms_prompt',
202
+ 'YouTube ASR: Chinese with Chinese Prompt': 'ytb_asr_batch3_zh_zh_prompt',
203
+
204
+ 'SEAME-Dev-Mandarin' : 'seame_dev_man',
205
+ 'SEAME-Dev-Singlish' : 'seame_dev_sge',
206
+ }
207
+
208
+
209
+ non_wer_displayname2datasetname = {
210
  'CoVoST2-EN-ID' : 'covost2_en_id_test',
211
  'CoVoST2-EN-ZH' : 'covost2_en_zh_test',
212
  'CoVoST2-EN-TA' : 'covost2_en_ta_test',
213
  'CoVoST2-ID-EN' : 'covost2_id_en_test',
214
  'CoVoST2-ZH-EN' : 'covost2_zh_en_test',
215
  'CoVoST2-TA-EN' : 'covost2_ta_en_test',
216
+
217
  'CN-College-Listen-MCQ': 'cn_college_listen_mcq_test',
218
  'DREAM-TTS-MCQ' : 'dream_tts_mcq_test',
219
  'SLUE-P2-SQA5' : 'slue_p2_sqa5_test',
220
  'Public-SG-Speech-QA' : 'public_sg_speech_qa_test',
221
  'Spoken-SQuAD' : 'spoken_squad_test',
222
+ 'MMAU-mini' : 'mmau_mini',
223
+
224
+ 'MNSC-PART3-SQA' : 'imda_part3_30s_sqa_human_test',
225
+ 'MNSC-PART4-SQA' : 'imda_part4_30s_sqa_human_test',
226
+ 'MNSC-PART5-SQA' : 'imda_part5_30s_sqa_human_test',
227
+ 'MNSC-PART6-SQA' : 'imda_part6_30s_sqa_human_test',
228
+
229
+ 'MNSC-PART3-SDS' : 'imda_part3_30s_ds_human_test',
230
+ 'MNSC-PART4-SDS' : 'imda_part4_30s_ds_human_test',
231
+ 'MNSC-PART5-SDS' : 'imda_part5_30s_ds_human_test',
232
+ 'MNSC-PART6-SDS' : 'imda_part6_30s_ds_human_test',
233
+
234
  'OpenHermes-Audio' : 'openhermes_audio_test',
235
  'ALPACA-Audio' : 'alpaca_audio_test',
236
+
237
  'WavCaps' : 'wavcaps_test',
238
  'AudioCaps' : 'audiocaps_test',
239
+
240
  'Clotho-AQA' : 'clotho_aqa_test',
241
  'WavCaps-QA' : 'wavcaps_qa_test',
242
  'AudioCaps-QA' : 'audiocaps_qa_test',
243
+
244
+ 'IEMOCAP-Emotion' : 'iemocap_emotion_test',
245
+ 'MELD-Sentiment' : 'meld_sentiment_test',
246
+ 'MELD-Emotion' : 'meld_emotion_test',
247
+
248
  'VoxCeleb-Accent' : 'voxceleb_accent_test',
249
  'MNSC-AR-Sentence' : 'imda_ar_sentence',
250
  'MNSC-AR-Dialogue' : 'imda_ar_dialogue',
251
+
252
  'VoxCeleb-Gender' : 'voxceleb_gender_test',
253
  'IEMOCAP-Gender' : 'iemocap_gender_test',
254
+
 
 
255
  'MuChoMusic' : 'muchomusic_test',
 
 
 
 
 
 
 
 
 
 
 
 
 
 
256
 
257
+ 'YouTube SQA: English with Singapore Content': 'ytb_sqa_batch1',
258
+ 'YouTube SDS: English with Singapore Content': 'ytb_sds_batch1',
259
+ 'YouTube PQA: English with Singapore Content': 'ytb_pqa_batch1',
 
 
 
 
 
 
 
 
 
 
 
 
 
260
 
261
  'YouTube SQA: Malay': 'ytb_sqa_batch3_malay',
262
  'YouTube SQA: Chinese': 'ytb_sqa_batch3_chinese',
 
265
  'YouTube SDS: Malay': 'ytb_sds_batch3_malay',
266
  'YouTube SDS: Chinese': 'ytb_sds_batch3_chinese',
267
  'YouTube SDS: Tamil': 'ytb_sds_batch3_tamil',
268
+ }
 
 
269
 
 
 
 
270
 
271
+ displayname2datasetname = {}
272
+ displayname2datasetname.update(wer_displayname2datasetname)
273
+ displayname2datasetname.update(non_wer_displayname2datasetname)
274
+
275
 
276
  datasetname2diaplayname = {datasetname: displayname for displayname, datasetname in displayname2datasetname.items()}
277
 
 
339
  'Parliament-Short': 'Under Development',
340
  'UKUS-News-Short' : 'Under Development',
341
  'Mediacorp-Short' : 'Under Development',
342
+
343
+ 'CommonVoice-ZH' : 'Under Development',
344
+ 'CommonVoice-17-Indonesian' : 'Under Development',
345
+ 'CommonVoice-17-Tamil' : 'Under Development',
346
+ 'CommonVoice-17-Thai' : 'Under Development',
347
+ 'CommonVoice-17-Vietnamese' : 'Under Development',
348
+ 'GigaSpeech-2-Indonesain' : 'Under Development',
349
+ 'GigaSpeech-2-Thai' : 'Under Development',
350
+ 'GigaSpeech-2-Vietnamese' : 'Under Development',
351
+ 'Fleurs-Tamil' : 'Under Development',
352
+ 'Lotus-Thai' : 'Under Development',
353
+ 'MMAU-mini' : 'Under Development',
354
+
355
 
356
  'YouTube ASR: English Singapore Content' : 'YouTube Evaluation Dataset for ASR Task: <br> This dataset contains English and Singlish audio clips, featuring Singapore-related content. <br> It includes approximately 2.5 hours of audio, with individual clips ranging from 2 seconds to 30 seconds in length.',
357
 
app/draw_diagram.py CHANGED
@@ -7,6 +7,7 @@ from app.content import *
7
 
8
  import pandas as pd
9
 
 
10
  from model_information import get_dataframe
11
  info_df = get_dataframe()
12
 
@@ -81,38 +82,7 @@ def draw(folder_name, category_name, displayname, metrics, cus_sort=True):
81
 
82
  return df_style
83
 
84
- if cur_dataset_name in [
85
- 'LibriSpeech-Clean',
86
- 'LibriSpeech-Other',
87
- 'CommonVoice-15-EN',
88
- 'Peoples-Speech',
89
- 'GigaSpeech-1',
90
- 'Earnings-21',
91
- 'Earnings-22',
92
- 'TED-LIUM-3',
93
- 'TED-LIUM-3-LongForm',
94
- 'AISHELL-ASR-ZH',
95
- 'MNSC-PART1-ASR',
96
- 'MNSC-PART2-ASR',
97
- 'MNSC-PART3-ASR',
98
- 'MNSC-PART4-ASR',
99
- 'MNSC-PART5-ASR',
100
- 'MNSC-PART6-ASR',
101
- 'CNA',
102
- 'IDPC',
103
- 'Parliament',
104
- 'UKUS-News',
105
- 'Mediacorp',
106
- 'IDPC-Short',
107
- 'Parliament-Short',
108
- 'UKUS-News-Short',
109
- 'Mediacorp-Short',
110
- 'YTB-ASR-Batch1',
111
- 'YTB-ASR-Batch2',
112
- 'SEAME-Dev-Man',
113
- 'SEAME-Dev-Sge',
114
- ]:
115
-
116
  chart_data_table = chart_data_table.sort_values(
117
  by=chart_data_table.columns[1],
118
  ascending=True
 
7
 
8
  import pandas as pd
9
 
10
+ from app.content import wer_displayname2datasetname
11
  from model_information import get_dataframe
12
  info_df = get_dataframe()
13
 
 
82
 
83
  return df_style
84
 
85
+ if cur_dataset_name in wer_displayname2datasetname:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
86
  chart_data_table = chart_data_table.sort_values(
87
  by=chart_data_table.columns[1],
88
  ascending=True
app/pages.py CHANGED
@@ -120,28 +120,12 @@ def dashboard():
120
  """)
121
 
122
 
123
-
124
-
125
-
126
-
127
-
128
  def asr_english():
129
  st.title("Task: Automatic Speech Recognition - English")
130
 
131
  sum = ['Overall']
132
- dataset_lists = [
133
- 'LibriSpeech-Clean',
134
- 'LibriSpeech-Other',
135
- 'CommonVoice-15-EN',
136
- 'Peoples-Speech',
137
- 'GigaSpeech-1',
138
- 'Earnings-21',
139
- 'Earnings-22',
140
- 'TED-LIUM-3',
141
- 'TED-LIUM-3-LongForm',
142
- ]
143
-
144
- filters_levelone = sum + dataset_lists
145
 
146
  left, center, _, middle, right = st.columns([0.4, 0.2, 0.2, 0.2 ,0.2])
147
 
@@ -156,23 +140,12 @@ def asr_english():
156
  draw('su', 'asr_english', filter_1, 'wer', cus_sort=True)
157
 
158
 
159
-
160
-
161
-
162
  def asr_singlish():
163
  st.title("Task: Automatic Speech Recognition - Singlish")
164
 
165
  sum = ['Overall']
166
- dataset_lists = [
167
- 'MNSC-PART1-ASR',
168
- 'MNSC-PART2-ASR',
169
- 'MNSC-PART3-ASR',
170
- 'MNSC-PART4-ASR',
171
- 'MNSC-PART5-ASR',
172
- 'MNSC-PART6-ASR',
173
- ]
174
 
175
- filters_levelone = sum + dataset_lists
176
 
177
  left, center, _, middle, right = st.columns([0.4, 0.2, 0.2, 0.2 ,0.2])
178
 
@@ -187,17 +160,12 @@ def asr_singlish():
187
  draw('su', 'asr_singlish', filter_1, 'wer')
188
 
189
 
190
-
191
-
192
  def asr_mandarin():
193
  st.title("Task: Automatic Speech Recognition - Mandarin")
194
 
195
  sum = ['Overall']
196
- dataset_lists = [
197
- 'AISHELL-ASR-ZH',
198
- ]
199
 
200
- filters_levelone = sum + dataset_lists
201
 
202
  left, center, _, middle, right = st.columns([0.4, 0.2, 0.2, 0.2 ,0.2])
203
 
@@ -211,22 +179,53 @@ def asr_mandarin():
211
  dataset_contents(dataset_diaplay_information[filter_1], metrics_info['wer'])
212
  draw('su', 'asr_mandarin', filter_1, 'wer')
213
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
214
 
 
 
 
 
 
 
 
 
 
 
 
215
 
216
 
217
  def speech_translation():
218
  st.title("Task: Speech Translation")
219
 
220
  sum = ['Overall']
221
- dataset_lists = [
222
- 'CoVoST2-EN-ID',
223
- 'CoVoST2-EN-ZH',
224
- 'CoVoST2-EN-TA',
225
- 'CoVoST2-ID-EN',
226
- 'CoVoST2-ZH-EN',
227
- 'CoVoST2-TA-EN']
228
 
229
- filters_levelone = sum + dataset_lists
230
 
231
  left, center, _, middle, right = st.columns([0.4, 0.2, 0.2, 0.2 ,0.2])
232
 
@@ -241,22 +240,12 @@ def speech_translation():
241
  draw('su', 'ST', filter_1, 'bleu')
242
 
243
 
244
-
245
-
246
  def speech_question_answering_english():
247
  st.title("Task: Spoken Question Answering - English")
248
 
249
  sum = ['Overall']
250
 
251
- dataset_lists = [
252
- 'CN-College-Listen-MCQ',
253
- 'DREAM-TTS-MCQ',
254
- 'SLUE-P2-SQA5',
255
- 'Public-SG-Speech-QA',
256
- 'Spoken-SQuAD',
257
- ]
258
-
259
- filters_levelone = sum + dataset_lists
260
 
261
  left, center, _, middle, right = st.columns([0.4, 0.2, 0.2, 0.2 ,0.2])
262
 
@@ -276,22 +265,12 @@ def speech_question_answering_english():
276
  draw('su', 'sqa_english', filter_1, 'llama3_70b_judge')
277
 
278
 
279
-
280
-
281
  def speech_question_answering_singlish():
282
  st.title("Task: Spoken Question Answering - Singlish")
283
 
284
  sum = ['Overall']
285
 
286
- dataset_lists = [
287
- 'MNSC-PART3-SQA',
288
- 'MNSC-PART4-SQA',
289
- 'MNSC-PART5-SQA',
290
- 'MNSC-PART6-SQA',
291
- ]
292
-
293
-
294
- filters_levelone = sum + dataset_lists
295
 
296
  left, center, _, middle, right = st.columns([0.4, 0.2, 0.2, 0.2 ,0.2])
297
 
@@ -312,15 +291,7 @@ def spoken_dialogue_summarization_singlish():
312
 
313
  sum = ['Overall']
314
 
315
- dataset_lists = [
316
- 'MNSC-PART3-SDS',
317
- 'MNSC-PART4-SDS',
318
- 'MNSC-PART5-SDS',
319
- 'MNSC-PART6-SDS',
320
- ]
321
-
322
-
323
- filters_levelone = sum + dataset_lists
324
 
325
  left, center, _, middle, right = st.columns([0.4, 0.2, 0.2, 0.2 ,0.2])
326
 
@@ -336,18 +307,12 @@ def spoken_dialogue_summarization_singlish():
336
  draw('su', 'sds_singlish', filter_1, 'llama3_70b_judge')
337
 
338
 
339
-
340
-
341
  def speech_instruction():
342
  st.title("Task: Speech Instruction")
343
 
344
  sum = ['Overall']
345
-
346
- dataset_lists = ['OpenHermes-Audio',
347
- 'ALPACA-Audio',
348
- ]
349
 
350
- filters_levelone = sum + dataset_lists
351
 
352
  left, center, _, middle, right = st.columns([0.4, 0.2, 0.2, 0.2 ,0.2])
353
 
@@ -362,14 +327,11 @@ def speech_instruction():
362
  draw('su', 'speech_instruction', filter_1, 'llama3_70b_judge')
363
 
364
 
365
-
366
-
367
  def audio_captioning():
368
  st.title("Task: Audio Captioning")
369
 
370
- filters_levelone = ['WavCaps',
371
- 'AudioCaps',
372
- ]
373
  filters_leveltwo = ['Llama3-70b-judge', 'Meteor']
374
 
375
  left, center, _, middle, right = st.columns([0.4, 0.2, 0.2, 0.2 ,0.2])
@@ -384,18 +346,12 @@ def audio_captioning():
384
  draw('asu', 'audio_captioning', filter_1, metric.lower().replace('-', '_'))
385
 
386
 
387
-
388
-
389
  def audio_scene_question_answering():
390
  st.title("Task: Audio Scene Question Answering")
391
 
392
  sum = ['Overall']
393
-
394
- dataset_lists = ['Clotho-AQA',
395
- 'WavCaps-QA',
396
- 'AudioCaps-QA']
397
 
398
- filters_levelone = sum + dataset_lists
399
 
400
  left, center, _, middle, right = st.columns([0.4, 0.2, 0.2, 0.2 ,0.2])
401
 
@@ -410,20 +366,12 @@ def audio_scene_question_answering():
410
  draw('asu', 'audio_scene_question_answering', filter_1, 'llama3_70b_judge')
411
 
412
 
413
-
414
-
415
  def emotion_recognition():
416
  st.title("Task: Emotion Recognition")
417
 
418
  sum = ['Overall']
419
 
420
- dataset_lists = [
421
- 'IEMOCAP-Emotion',
422
- 'MELD-Sentiment',
423
- 'MELD-Emotion',
424
- ]
425
-
426
- filters_levelone = sum + dataset_lists
427
 
428
  left, center, _, middle, right = st.columns([0.4, 0.2, 0.2, 0.2 ,0.2])
429
 
@@ -438,20 +386,12 @@ def emotion_recognition():
438
  draw('vu', 'emotion_recognition', filter_1, 'llama3_70b_judge')
439
 
440
 
441
-
442
-
443
  def accent_recognition():
444
  st.title("Task: Accent Recognition")
445
 
446
  sum = ['Overall']
447
- dataset_lists = [
448
- 'VoxCeleb-Accent',
449
- 'MNSC-AR-Sentence',
450
- 'MNSC-AR-Dialogue',
451
- ]
452
-
453
 
454
- filters_levelone = sum + dataset_lists
455
 
456
  left, center, _, middle, right = st.columns([0.4, 0.2, 0.2, 0.2 ,0.2])
457
 
@@ -467,19 +407,12 @@ def accent_recognition():
467
  draw('vu', 'accent_recognition', filter_1, 'llama3_70b_judge')
468
 
469
 
470
-
471
-
472
  def gender_recognition():
473
  st.title("Task: Gender Recognition")
474
 
475
  sum = ['Overall']
476
 
477
- dataset_lists = [
478
- 'VoxCeleb-Gender',
479
- 'IEMOCAP-Gender'
480
- ]
481
-
482
- filters_levelone = sum + dataset_lists
483
 
484
  left, center, _, middle, right = st.columns([0.4, 0.2, 0.2, 0.2 ,0.2])
485
 
@@ -494,17 +427,12 @@ def gender_recognition():
494
  draw('vu', 'gender_recognition', filter_1, 'llama3_70b_judge')
495
 
496
 
497
-
498
-
499
  def music_understanding():
500
  st.title("Task: Music Understanding - MCQ Questions")
501
 
502
  sum = ['Overall']
503
 
504
- dataset_lists = ['MuChoMusic',
505
- ]
506
-
507
- filters_levelone = sum + dataset_lists
508
 
509
  left, center, _, middle, right = st.columns([0.4, 0.2, 0.2, 0.2 ,0.2])
510
 
@@ -519,43 +447,10 @@ def music_understanding():
519
  draw('vu', 'music_understanding', filter_1, 'llama3_70b_judge')
520
 
521
 
522
-
523
-
524
-
525
  def under_development():
526
  st.title("Task: Under Development")
527
-
528
 
529
- dataset_lists = [
530
- 'YouTube ASR: English Singapore Content',
531
- 'YouTube ASR: English with Strong Emotion',
532
- 'YouTube ASR: Malay with English Prompt',
533
- 'YouTube ASR: Malay with Malay Prompt',
534
- 'YouTube ASR: Chinese with English Prompt',
535
- 'YouTube ASR: Chinese with Chinese Prompt',
536
-
537
- 'YouTube SQA: English with Singapore Content',
538
- 'YouTube SDS: English with Singapore Content',
539
- 'YouTube PQA: English with Singapore Content',
540
-
541
- 'CNA',
542
- 'IDPC',
543
- 'Parliament',
544
- 'UKUS-News',
545
- 'Mediacorp',
546
- 'IDPC-Short',
547
- 'Parliament-Short',
548
- 'UKUS-News-Short',
549
- 'Mediacorp-Short',
550
-
551
- 'SEAME-Dev-Mandarin',
552
- 'SEAME-Dev-Singlish',
553
-
554
-
555
-
556
- ]
557
-
558
- filters_levelone = dataset_lists
559
 
560
  left, center, _, middle, right = st.columns([0.4, 0.2, 0.2, 0.2 ,0.2])
561
 
@@ -592,39 +487,8 @@ def under_development():
592
  st.markdown('To be implemented')
593
  # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
594
 
595
- if filter_1 in [
596
- 'CNA',
597
- 'IDPC',
598
- 'Parliament',
599
- 'UKUS-News',
600
- 'Mediacorp',
601
- 'IDPC-Short',
602
- 'Parliament-Short',
603
- 'UKUS-News-Short',
604
- 'Mediacorp-Short',
605
-
606
- 'YouTube ASR: English Singapore Content',
607
- 'YouTube ASR: English with Strong Emotion',
608
- 'YouTube ASR: Malay with English Prompt',
609
- 'YouTube ASR: Malay with Malay Prompt',
610
-
611
- 'YouTube ASR: Chinese with English Prompt',
612
- 'YouTube ASR: Chinese with Chinese Prompt',
613
-
614
- 'SEAME-Dev-Mandarin',
615
- 'SEAME-Dev-Singlish',
616
- ]:
617
-
618
  draw('vu', 'under_development_wer', filter_1, 'wer')
619
 
620
- elif filter_1 in [
621
- 'YouTube SQA: English with Singapore Content',
622
- 'YouTube SDS: English with Singapore Content',
623
- 'YouTube PQA: English with Singapore Content',
624
- ]:
625
  draw('vu', 'under_development_llama3_70b_judge', filter_1, 'llama3_70b_judge')
626
-
627
-
628
-
629
-
630
-
 
120
  """)
121
 
122
 
 
 
 
 
 
123
  def asr_english():
124
  st.title("Task: Automatic Speech Recognition - English")
125
 
126
  sum = ['Overall']
127
+
128
+ filters_levelone = sum + asr_english_datasets
 
 
 
 
 
 
 
 
 
 
 
129
 
130
  left, center, _, middle, right = st.columns([0.4, 0.2, 0.2, 0.2 ,0.2])
131
 
 
140
  draw('su', 'asr_english', filter_1, 'wer', cus_sort=True)
141
 
142
 
 
 
 
143
  def asr_singlish():
144
  st.title("Task: Automatic Speech Recognition - Singlish")
145
 
146
  sum = ['Overall']
 
 
 
 
 
 
 
 
147
 
148
+ filters_levelone = sum + asr_singlish_datasets
149
 
150
  left, center, _, middle, right = st.columns([0.4, 0.2, 0.2, 0.2 ,0.2])
151
 
 
160
  draw('su', 'asr_singlish', filter_1, 'wer')
161
 
162
 
 
 
163
  def asr_mandarin():
164
  st.title("Task: Automatic Speech Recognition - Mandarin")
165
 
166
  sum = ['Overall']
 
 
 
167
 
168
+ filters_levelone = sum + asr_mandarin_datasets
169
 
170
  left, center, _, middle, right = st.columns([0.4, 0.2, 0.2, 0.2 ,0.2])
171
 
 
179
  dataset_contents(dataset_diaplay_information[filter_1], metrics_info['wer'])
180
  draw('su', 'asr_mandarin', filter_1, 'wer')
181
 
182
+
183
+ def asr_sea():
184
+ st.title("Task: Automatic Speech Recognition - SEA Languages")
185
+
186
+ sum = ['Overall']
187
+
188
+ filters_levelone = sum + asr_sea_datasets
189
+
190
+ left, center, _, middle, right = st.columns([0.4, 0.2, 0.2, 0.2 ,0.2])
191
+
192
+ with left:
193
+ filter_1 = st.selectbox('Dataset', filters_levelone)
194
+
195
+ if filter_1:
196
+ if filter_1 in sum:
197
+ sum_table_mulit_metrix('asr_sea', ['wer'])
198
+ else:
199
+ dataset_contents(dataset_diaplay_information[filter_1], metrics_info['wer'])
200
+ draw('su', 'asr_sea', filter_1, 'wer')
201
+
202
+
203
+ def asr_private():
204
+ st.title("Task: Automatic Speech Recognition - Private Datasets")
205
+
206
+ sum = ['Overall']
207
+
208
+ filters_levelone = sum + asr_private_datasets
209
 
210
+ left, center, _, middle, right = st.columns([0.4, 0.2, 0.2, 0.2 ,0.2])
211
+
212
+ with left:
213
+ filter_1 = st.selectbox('Dataset', filters_levelone)
214
+
215
+ if filter_1:
216
+ if filter_1 in sum:
217
+ sum_table_mulit_metrix('asr_private', ['wer'])
218
+ else:
219
+ dataset_contents(dataset_diaplay_information[filter_1], metrics_info['wer'])
220
+ draw('su', 'asr_private', filter_1, 'wer')
221
 
222
 
223
  def speech_translation():
224
  st.title("Task: Speech Translation")
225
 
226
  sum = ['Overall']
 
 
 
 
 
 
 
227
 
228
+ filters_levelone = sum + speech_translation_datasets
229
 
230
  left, center, _, middle, right = st.columns([0.4, 0.2, 0.2, 0.2 ,0.2])
231
 
 
240
  draw('su', 'ST', filter_1, 'bleu')
241
 
242
 
 
 
243
  def speech_question_answering_english():
244
  st.title("Task: Spoken Question Answering - English")
245
 
246
  sum = ['Overall']
247
 
248
+ filters_levelone = sum + speech_qa_english_datasets
 
 
 
 
 
 
 
 
249
 
250
  left, center, _, middle, right = st.columns([0.4, 0.2, 0.2, 0.2 ,0.2])
251
 
 
265
  draw('su', 'sqa_english', filter_1, 'llama3_70b_judge')
266
 
267
 
 
 
268
  def speech_question_answering_singlish():
269
  st.title("Task: Spoken Question Answering - Singlish")
270
 
271
  sum = ['Overall']
272
 
273
+ filters_levelone = sum + speech_qa_singlish_datasets
 
 
 
 
 
 
 
 
274
 
275
  left, center, _, middle, right = st.columns([0.4, 0.2, 0.2, 0.2 ,0.2])
276
 
 
291
 
292
  sum = ['Overall']
293
 
294
+ filters_levelone = sum + sds_datasets
 
 
 
 
 
 
 
 
295
 
296
  left, center, _, middle, right = st.columns([0.4, 0.2, 0.2, 0.2 ,0.2])
297
 
 
307
  draw('su', 'sds_singlish', filter_1, 'llama3_70b_judge')
308
 
309
 
 
 
310
  def speech_instruction():
311
  st.title("Task: Speech Instruction")
312
 
313
  sum = ['Overall']
 
 
 
 
314
 
315
+ filters_levelone = sum + si_datasets
316
 
317
  left, center, _, middle, right = st.columns([0.4, 0.2, 0.2, 0.2 ,0.2])
318
 
 
327
  draw('su', 'speech_instruction', filter_1, 'llama3_70b_judge')
328
 
329
 
 
 
330
  def audio_captioning():
331
  st.title("Task: Audio Captioning")
332
 
333
+ filters_levelone = ac_datasets
334
+
 
335
  filters_leveltwo = ['Llama3-70b-judge', 'Meteor']
336
 
337
  left, center, _, middle, right = st.columns([0.4, 0.2, 0.2, 0.2 ,0.2])
 
346
  draw('asu', 'audio_captioning', filter_1, metric.lower().replace('-', '_'))
347
 
348
 
 
 
349
  def audio_scene_question_answering():
350
  st.title("Task: Audio Scene Question Answering")
351
 
352
  sum = ['Overall']
 
 
 
 
353
 
354
+ filters_levelone = sum + asqa_datasets
355
 
356
  left, center, _, middle, right = st.columns([0.4, 0.2, 0.2, 0.2 ,0.2])
357
 
 
366
  draw('asu', 'audio_scene_question_answering', filter_1, 'llama3_70b_judge')
367
 
368
 
 
 
369
  def emotion_recognition():
370
  st.title("Task: Emotion Recognition")
371
 
372
  sum = ['Overall']
373
 
374
+ filters_levelone = sum + er_datasets
 
 
 
 
 
 
375
 
376
  left, center, _, middle, right = st.columns([0.4, 0.2, 0.2, 0.2 ,0.2])
377
 
 
386
  draw('vu', 'emotion_recognition', filter_1, 'llama3_70b_judge')
387
 
388
 
 
 
389
  def accent_recognition():
390
  st.title("Task: Accent Recognition")
391
 
392
  sum = ['Overall']
 
 
 
 
 
 
393
 
394
+ filters_levelone = sum + ar_datasets
395
 
396
  left, center, _, middle, right = st.columns([0.4, 0.2, 0.2, 0.2 ,0.2])
397
 
 
407
  draw('vu', 'accent_recognition', filter_1, 'llama3_70b_judge')
408
 
409
 
 
 
410
  def gender_recognition():
411
  st.title("Task: Gender Recognition")
412
 
413
  sum = ['Overall']
414
 
415
+ filters_levelone = sum + gr_datasets
 
 
 
 
 
416
 
417
  left, center, _, middle, right = st.columns([0.4, 0.2, 0.2, 0.2 ,0.2])
418
 
 
427
  draw('vu', 'gender_recognition', filter_1, 'llama3_70b_judge')
428
 
429
 
 
 
430
  def music_understanding():
431
  st.title("Task: Music Understanding - MCQ Questions")
432
 
433
  sum = ['Overall']
434
 
435
+ filters_levelone = sum + music_datasets
 
 
 
436
 
437
  left, center, _, middle, right = st.columns([0.4, 0.2, 0.2, 0.2 ,0.2])
438
 
 
447
  draw('vu', 'music_understanding', filter_1, 'llama3_70b_judge')
448
 
449
 
 
 
 
450
  def under_development():
451
  st.title("Task: Under Development")
 
452
 
453
+ filters_levelone = non_wer_development_datasets + wer_development_datasets
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
454
 
455
  left, center, _, middle, right = st.columns([0.4, 0.2, 0.2, 0.2 ,0.2])
456
 
 
487
  st.markdown('To be implemented')
488
  # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
489
 
490
+ if filter_1 in wer_development_datasets:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
491
  draw('vu', 'under_development_wer', filter_1, 'wer')
492
 
493
+ elif filter_1 in non_wer_development_datasets:
 
 
 
 
494
  draw('vu', 'under_development_llama3_70b_judge', filter_1, 'llama3_70b_judge')
 
 
 
 
 
app/summarization.py CHANGED
@@ -27,7 +27,6 @@ def sum_table_mulit_metrix(task_name, metrics_lists: List[str]):
27
  chart_data = one_chart_data
28
  else:
29
  chart_data = pd.merge(chart_data, one_chart_data, on='Model', how='outer')
30
-
31
 
32
  selected_columns = [i for i in chart_data.columns if i != 'Model']
33
  chart_data['Average'] = chart_data[selected_columns].mean(axis=1)
@@ -79,7 +78,8 @@ def sum_table_mulit_metrix(task_name, metrics_lists: List[str]):
79
 
80
 
81
  # Format numeric columns to 2 decimal places
82
- chart_data_table[chart_data_table.columns[1]] = chart_data_table[chart_data_table.columns[1]].apply(lambda x: round(float(x), 3) if isinstance(float(x), (int, float)) else float(x))
 
83
 
84
  if metrics in ['wer']:
85
  ascend = True
 
27
  chart_data = one_chart_data
28
  else:
29
  chart_data = pd.merge(chart_data, one_chart_data, on='Model', how='outer')
 
30
 
31
  selected_columns = [i for i in chart_data.columns if i != 'Model']
32
  chart_data['Average'] = chart_data[selected_columns].mean(axis=1)
 
78
 
79
 
80
  # Format numeric columns to 2 decimal places
81
+ target_column = chart_data_table.columns[1]
82
+ chart_data_table.loc[:, target_column] = chart_data_table[target_column].apply(lambda x: round(float(x), 3) if isinstance(float(x), (int, float)) else float(x))
83
 
84
  if metrics in ['wer']:
85
  ascend = True
model_information.py CHANGED
@@ -36,6 +36,30 @@ data['Original Name'].append('MERaLiON-AudioLLM-Whisper-SEA-LION')
36
  data['Proper Display Name'].append('Fusion: MERaLiON-AudioLLM-Whisper-SEA-LION')
37
  data['Link'].append('https://huggingface.co/MERaLiON/MERaLiON-AudioLLM-Whisper-SEA-LION')
38
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
39
  data['Original Name'].append('cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct')
40
  data['Proper Display Name'].append('Cascade: Whisper-Large-v2 / SEA-LIONv3')
41
  data['Link'].append('https://github.com/aisingapore/sealion')
@@ -44,7 +68,6 @@ data['Original Name'].append('whisper_large_v3')
44
  data['Proper Display Name'].append('Whisper-large-v3')
45
  data['Link'].append('https://huggingface.co/openai/whisper-large-v3')
46
 
47
-
48
  data['Original Name'].append('gemini-1.5-flash')
49
  data['Proper Display Name'].append('Gemini-1.5-Flash')
50
  data['Link'].append('https://ai.google.dev/gemini-api/docs/models/gemini')
 
36
  data['Proper Display Name'].append('Fusion: MERaLiON-AudioLLM-Whisper-SEA-LION')
37
  data['Link'].append('https://huggingface.co/MERaLiON/MERaLiON-AudioLLM-Whisper-SEA-LION')
38
 
39
+ data['Original Name'].append('MERaLiON-AudioLLM-v2-2b')
40
+ data['Proper Display Name'].append('Fusion: MERaLiON-2-3B')
41
+ data['Link'].append('https://huggingface.co/MERaLiON/MERaLiON-AudioLLM-Whisper-SEA-LION')
42
+
43
+ data['Original Name'].append('MERaLiON-AudioLLM-v2-9b')
44
+ data['Proper Display Name'].append('Fusion: MERaLiON-2-10B')
45
+ data['Link'].append('https://huggingface.co/MERaLiON/MERaLiON-AudioLLM-Whisper-SEA-LION')
46
+
47
+ data['Original Name'].append('MERaLiON-AudioLLM-v2-9b-asr')
48
+ data['Proper Display Name'].append('Fusion: MERaLiON-2-10B-ASR')
49
+ data['Link'].append('https://huggingface.co/MERaLiON/MERaLiON-AudioLLM-Whisper-SEA-LION')
50
+
51
+ data['Original Name'].append('phi_4_multimodal_instruct')
52
+ data['Proper Display Name'].append('Fusion: Phi-4-multimodal-instruct')
53
+ data['Link'].append('https://huggingface.co/microsoft/Phi-4-multimodal-instruct')
54
+
55
+ data['Original Name'].append('Qwen2.5-Omni-3B')
56
+ data['Proper Display Name'].append('Fusion: Qwen2.5-Omni-3B')
57
+ data['Link'].append('https://huggingface.co/Qwen/Qwen2.5-Omni-3B')
58
+
59
+ data['Original Name'].append('Qwen2.5-Omni-7B')
60
+ data['Proper Display Name'].append('Fusion: Qwen2.5-Omni-7B')
61
+ data['Link'].append('https://huggingface.co/Qwen/Qwen2.5-Omni-7B')
62
+
63
  data['Original Name'].append('cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct')
64
  data['Proper Display Name'].append('Cascade: Whisper-Large-v2 / SEA-LIONv3')
65
  data['Link'].append('https://github.com/aisingapore/sealion')
 
68
  data['Proper Display Name'].append('Whisper-large-v3')
69
  data['Link'].append('https://huggingface.co/openai/whisper-large-v3')
70
 
 
71
  data['Original Name'].append('gemini-1.5-flash')
72
  data['Proper Display Name'].append('Gemini-1.5-Flash')
73
  data['Link'].append('https://ai.google.dev/gemini-api/docs/models/gemini')
results_organized/bleu/st.csv CHANGED
@@ -1,12 +1,17 @@
1
  Model,covost2_en_id_test,covost2_en_zh_test,covost2_en_ta_test,covost2_id_en_test,covost2_zh_en_test,covost2_ta_en_test
2
- Qwen-Audio-Chat,4.102230932924371,15.330641138043728,0.03451483807236294,0.45648619714728844,9.898238298955656,0.01699144301093184
3
- MERaLiON-AudioLLM-Whisper-SEA-LION,37.60224687716629,43.941098854450516,14.407399367512914,44.43289180618449,18.76473995941838,5.023057608950299
4
- hy_whisper_local_cs,1.0869208512565696,0.10573269629215352,0.008950516549431693,22.267131378964944,7.31707791416422,2.8610263518826757
5
- Qwen2-Audio-7B-Instruct,16.325186897428104,25.765420247070075,0.03245972071872916,6.326113431899141,16.466557744958333,0.04425838146050298
6
- whisper_large_v3,1.600581653970121,0.16408986541757878,0.02107778621423822,46.01512198258627,14.673689493155793,2.451098639578599
7
  old_models,,,,,,
8
- cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct,27.620150160643625,35.274306071307024,8.433062902024755,46.80524126004861,15.209998552437538,2.8327095799289337
9
  gemini-1.5-flash,,,,,,
10
- WavLLM_fairseq,13.841886973016162,31.96381187282953,0.0033159224040994286,5.933522277713613,2.368659001743569,0.1695522548322915
11
- SALMONN_7B,14.102682915273142,33.88941292215531,0.00046745670226766583,26.89649039333571,5.296039450108202,0.3649023706010388
12
- cascade_whisper_large_v3_llama_3_8b_instruct,10.930203684508578,5.987143868370054,1.0368044741318085,46.79924664837527,14.154700735606419,2.4245628096245917
 
 
 
 
 
 
 
 
1
  Model,covost2_en_id_test,covost2_en_zh_test,covost2_en_ta_test,covost2_id_en_test,covost2_zh_en_test,covost2_ta_en_test
2
+ Qwen-Audio-Chat,4.102230932924371,15.330641138043728,0.0345148380723629,0.4564861971472884,9.898238298955656,0.0169914430109318
3
+ hy_whisper_local_cs,1.0869208512565696,0.1057326962921535,0.0089505165494316,22.267131378964944,7.31707791416422,2.8610263518826757
4
+ Qwen2-Audio-7B-Instruct,16.325186897428104,25.765420247070075,0.0324597207187291,6.326113431899141,16.466557744958333,0.0442583814605029
5
+ whisper_large_v3,1.600581653970121,0.1640898654175787,0.0210777862142382,46.01512198258627,14.673689493155791,2.451098639578599
 
6
  old_models,,,,,,
 
7
  gemini-1.5-flash,,,,,,
8
+ WavLLM_fairseq,13.841886973016162,31.96381187282953,0.0033159224040994,5.933522277713613,2.368659001743569,0.1695522548322915
9
+ MERaLiON-AudioLLM-Whisper-SEA-LION,37.058238343330466,43.96331874536172,13.808713343771569,43.37364836260576,19.55610418584389,4.758175879451736
10
+ MERaLiON-AudioLLM-v2-2b,30.658188021678257,40.02820084309168,5.601731502002274,37.77329494766737,16.777825775562142,1.9423083468131173
11
+ MERaLiON-AudioLLM-v2-9b,36.242124109428445,43.747307981166834,10.885517678613343,47.85937752036512,22.133726547487697,3.4786390367027833
12
+ Qwen2.5-Omni-3B,3.2577143149506815,10.28866767786604,0.020665917336912663,15.00712601210481,8.98152195711894,0.04161842995351044
13
+ Qwen2.5-Omni-7B,2.612412992528698,12.429229982446326,0.05482974047730791,12.471476026200369,9.974234734341179,0.02999794683579762
14
+ SALMONN_7B,14.193483776951359,33.255550227097565,0.0005121531999434492,27.88515689237341,5.175547389931541,0.40577007761551664
15
+ cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct,27.59161630015759,28.71368811388653,7.474730798912167,46.80524126004861,15.209998552437538,2.8327095799289337
16
+ cascade_whisper_large_v3_llama_3_8b_instruct,10.753313930099422,6.089840198985321,1.0029597453865848,46.79744652156276,14.156349261775734,2.4177196689141547
17
+ phi_4_multimodal_instruct,14.553644350540432,45.48015814069248,0.14817117451495013,0.37716244197757426,22.330318273444895,0.07320611681035753
results_organized/llama3_70b_judge/accent_recognition.csv CHANGED
@@ -1,12 +1,17 @@
1
  Model,voxceleb_accent_test,imda_ar_sentence,imda_ar_dialogue
2
  Qwen-Audio-Chat,48.05088223225277,3.933333333333333,0.6666666666666667
3
- MERaLiON-AudioLLM-Whisper-SEA-LION,47.01682396389003,7.816666666666666,77.83333333333333
4
  hy_whisper_local_cs,,,
5
- Qwen2-Audio-7B-Instruct,29.187525646286417,2.55,0.9666666666666667
6
  whisper_large_v3,,,
7
  old_models,,,
8
- cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct,24.640951990151827,26.016666666666666,7.633333333333334
9
  gemini-1.5-flash,,,
10
- WavLLM_fairseq,39.96717275338531,2.6833333333333336,0.23333333333333336
11
- SALMONN_7B,34.222404595814524,2.5166666666666666,0.06666666666666667
12
- cascade_whisper_large_v3_llama_3_8b_instruct,39.32704144439885,12.416666666666666,9.666666666666666
 
 
 
 
 
 
 
 
1
  Model,voxceleb_accent_test,imda_ar_sentence,imda_ar_dialogue
2
  Qwen-Audio-Chat,48.05088223225277,3.933333333333333,0.6666666666666667
 
3
  hy_whisper_local_cs,,,
4
+ Qwen2-Audio-7B-Instruct,29.187525646286417,2.55,0.9666666666666668
5
  whisper_large_v3,,,
6
  old_models,,,
 
7
  gemini-1.5-flash,,,
8
+ WavLLM_fairseq,39.96717275338531,2.6833333333333336,0.2333333333333333
9
+ MERaLiON-AudioLLM-Whisper-SEA-LION,47.066064833812064,,
10
+ MERaLiON-AudioLLM-v2-2b,66.59827656955272,,
11
+ MERaLiON-AudioLLM-v2-9b,40.78785391875257,,
12
+ Qwen2.5-Omni-3B,0.9027492819039803,,
13
+ Qwen2.5-Omni-7B,1.661879359868691,,
14
+ SALMONN_7B,31.69881001231022,,
15
+ cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct,28.00574476815757,,
16
+ cascade_whisper_large_v3_llama_3_8b_instruct,40.29544521953221,,
17
+ phi_4_multimodal_instruct,2.6261797291752154,,
results_organized/llama3_70b_judge/audio_captioning.csv CHANGED
@@ -1,12 +1,17 @@
1
  Model,audiocaps_test,wavcaps_test
2
  Qwen-Audio-Chat,47.04090909090909,32.9364161849711
3
- MERaLiON-AudioLLM-Whisper-SEA-LION,38.00454545454545,33.97687861271676
4
  hy_whisper_local_cs,,
5
  Qwen2-Audio-7B-Instruct,40.77727272727273,33.78034682080925
6
  whisper_large_v3,,
7
  old_models,,
8
- cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct,3.0954545454545457,6.3468208092485545
9
  gemini-1.5-flash,,
10
  WavLLM_fairseq,5.5,6.901734104046243
11
- SALMONN_7B,37.445454545454545,23.76878612716763
12
- cascade_whisper_large_v3_llama_3_8b_instruct,2.4727272727272727,3.445086705202312
 
 
 
 
 
 
 
 
1
  Model,audiocaps_test,wavcaps_test
2
  Qwen-Audio-Chat,47.04090909090909,32.9364161849711
 
3
  hy_whisper_local_cs,,
4
  Qwen2-Audio-7B-Instruct,40.77727272727273,33.78034682080925
5
  whisper_large_v3,,
6
  old_models,,
 
7
  gemini-1.5-flash,,
8
  WavLLM_fairseq,5.5,6.901734104046243
9
+ MERaLiON-AudioLLM-Whisper-SEA-LION,39.38636363636363,34.566473988439306
10
+ MERaLiON-AudioLLM-v2-2b,35.07727272727273,31.410404624277458
11
+ MERaLiON-AudioLLM-v2-9b,36.04090909090909,35.16763005780347
12
+ Qwen2.5-Omni-3B,43.69545454545454,34.70520231213873
13
+ Qwen2.5-Omni-7B,37.7,26.09248554913295
14
+ SALMONN_7B,35.24090909090909,22.520231213872833
15
+ cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct,2.4545454545454546,3.8265895953757223
16
+ cascade_whisper_large_v3_llama_3_8b_instruct,2.5136363636363637,3.3179190751445087
17
+ phi_4_multimodal_instruct,33.595454545454544,28.069364161849713
results_organized/llama3_70b_judge/audio_scene_question_answering.csv CHANGED
@@ -1,12 +1,17 @@
1
  Model,clotho_aqa_test,audiocaps_qa_test,wavcaps_qa_test
2
  Qwen-Audio-Chat,61.934856587263,50.22364217252396,42.69736842105263
3
- MERaLiON-AudioLLM-Whisper-SEA-LION,63.15021876519203,49.77635782747604,46.31578947368421
4
  hy_whisper_local_cs,,,
5
  Qwen2-Audio-7B-Instruct,50.919591292758774,45.75079872204473,44.473684210526315
6
  whisper_large_v3,,,
7
  old_models,,,
8
- cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct,24.647544968400585,18.466453674121407,18.88157894736842
9
  gemini-1.5-flash,,,
10
  WavLLM_fairseq,43.01199466903598,29.840255591054312,26.25
11
- SALMONN_7B,57.75401069518716,50.287539936102235,47.30263157894737
12
- cascade_whisper_large_v3_llama_3_8b_instruct,29.47134606841404,17.380191693290733,16.710526315789473
 
 
 
 
 
 
 
 
1
  Model,clotho_aqa_test,audiocaps_qa_test,wavcaps_qa_test
2
  Qwen-Audio-Chat,61.934856587263,50.22364217252396,42.69736842105263
 
3
  hy_whisper_local_cs,,,
4
  Qwen2-Audio-7B-Instruct,50.919591292758774,45.75079872204473,44.473684210526315
5
  whisper_large_v3,,,
6
  old_models,,,
 
7
  gemini-1.5-flash,,,
8
  WavLLM_fairseq,43.01199466903598,29.840255591054312,26.25
9
+ MERaLiON-AudioLLM-Whisper-SEA-LION,62.67379679144385,48.81789137380192,45.131578947368425
10
+ MERaLiON-AudioLLM-v2-2b,50.53962080700049,44.79233226837061,43.0921052631579
11
+ MERaLiON-AudioLLM-v2-9b,58.20126397666505,50.35143769968051,44.868421052631575
12
+ Qwen2.5-Omni-3B,52.64948954788527,48.56230031948882,43.15789473684211
13
+ Qwen2.5-Omni-7B,46.592124453087024,50.41533546325879,40.0
14
+ SALMONN_7B,58.19154107924162,50.35143769968051,46.90789473684211
15
+ cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct,22.673796791443852,17.44408945686901,14.013157894736842
16
+ cascade_whisper_large_v3_llama_3_8b_instruct,29.820126397666506,17.06070287539936,18.75
17
+ phi_4_multimodal_instruct,48.37141468157511,40.319488817891376,37.96052631578947
results_organized/llama3_70b_judge/emotion_recognition.csv CHANGED
@@ -1,12 +1,17 @@
1
  Model,iemocap_emotion_test,meld_sentiment_test,meld_emotion_test
2
  Qwen-Audio-Chat,29.382470119521916,44.90421455938697,50.72796934865901
3
- MERaLiON-AudioLLM-Whisper-SEA-LION,48.505976095617534,46.206896551724135,36.36015325670498
4
  hy_whisper_local_cs,,,
5
  Qwen2-Audio-7B-Instruct,53.98406374501992,53.9463601532567,41.60919540229885
6
  whisper_large_v3,,,
7
  old_models,,,
8
- cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct,44.322709163346616,56.59003831417625,47.356321839080465
9
  gemini-1.5-flash,,,
10
  WavLLM_fairseq,59.76095617529881,51.072796934865906,41.57088122605364
11
- SALMONN_7B,23.804780876494025,41.7624521072797,30.536398467432953
12
- cascade_whisper_large_v3_llama_3_8b_instruct,46.713147410358566,45.593869731800766,36.81992337164751
 
 
 
 
 
 
 
 
1
  Model,iemocap_emotion_test,meld_sentiment_test,meld_emotion_test
2
  Qwen-Audio-Chat,29.382470119521916,44.90421455938697,50.72796934865901
 
3
  hy_whisper_local_cs,,,
4
  Qwen2-Audio-7B-Instruct,53.98406374501992,53.9463601532567,41.60919540229885
5
  whisper_large_v3,,,
6
  old_models,,,
 
7
  gemini-1.5-flash,,,
8
  WavLLM_fairseq,59.76095617529881,51.072796934865906,41.57088122605364
9
+ MERaLiON-AudioLLM-Whisper-SEA-LION,49.103585657370516,52.452107279693486,44.17624521072797
10
+ MERaLiON-AudioLLM-v2-2b,51.39442231075698,58.582375478927204,52.1455938697318
11
+ MERaLiON-AudioLLM-v2-9b,62.54980079681275,68.85057471264368,59.808429118773944
12
+ Qwen2.5-Omni-3B,34.36254980079681,30.421455938697317,34.32950191570881
13
+ Qwen2.5-Omni-7B,36.55378486055777,27.77777777777778,30.07662835249042
14
+ SALMONN_7B,26.195219123505975,42.26053639846744,32.298850574712645
15
+ cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct,41.98207171314741,58.39080459770115,44.272030651341
16
+ cascade_whisper_large_v3_llama_3_8b_instruct,46.91235059760957,56.47509578544061,49.42528735632184
17
+ phi_4_multimodal_instruct,32.07171314741036,49.11877394636016,40.84291187739464
results_organized/llama3_70b_judge/gender_recognition.csv CHANGED
@@ -1,12 +1,17 @@
1
- Model,voxceleb_gender_test,iemocap_gender_test,imda_gr_sentence,imda_gr_dialogue
2
- Qwen-Audio-Chat,70.5990972507181,50.0996015936255,57.550000000000004,37.2
3
- MERaLiON-AudioLLM-Whisper-SEA-LION,99.75379565038982,93.48605577689243,66.13333333333333,93.76666666666667
4
- hy_whisper_local_cs,,,,
5
- Qwen2-Audio-7B-Instruct,99.1177677472302,92.80876494023903,68.38333333333333,61.56666666666667
6
- whisper_large_v3,,,,
7
- old_models,,,,
8
- cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct,34.94050061551087,15.737051792828685,26.35,19.6
9
- gemini-1.5-flash,,,,
10
- WavLLM_fairseq,69.61427985227739,51.932270916334666,49.06666666666666,46.766666666666666
11
- SALMONN_7B,88.79770209273697,81.31474103585658,59.766666666666666,42.733333333333334
12
- cascade_whisper_large_v3_llama_3_8b_instruct,42.921624948707425,44.22310756972111,36.016666666666666,25.433333333333337
 
 
 
 
 
 
1
+ Model,voxceleb_gender_test,iemocap_gender_test
2
+ Qwen-Audio-Chat,70.5990972507181,50.0996015936255
3
+ hy_whisper_local_cs,,
4
+ Qwen2-Audio-7B-Instruct,99.1177677472302,92.80876494023904
5
+ whisper_large_v3,,
6
+ old_models,,
7
+ gemini-1.5-flash,,
8
+ WavLLM_fairseq,69.61427985227739,51.932270916334666
9
+ MERaLiON-AudioLLM-Whisper-SEA-LION,99.73327862125564,94.6215139442231
10
+ MERaLiON-AudioLLM-v2-2b,99.69224456298728,87.92828685258964
11
+ MERaLiON-AudioLLM-v2-9b,97.2507180960197,92.96812749003983
12
+ Qwen2.5-Omni-3B,32.78621255642183,62.948207171314735
13
+ Qwen2.5-Omni-7B,54.08288879770209,43.366533864541836
14
+ SALMONN_7B,88.53098071399262,80.199203187251
15
+ cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct,26.631103816167418,12.211155378486056
16
+ cascade_whisper_large_v3_llama_3_8b_instruct,69.69634796881412,44.38247011952191
17
+ phi_4_multimodal_instruct,94.58350430857611,46.852589641434264
results_organized/llama3_70b_judge/music_understanding.csv CHANGED
@@ -1,12 +1,17 @@
1
  Model,muchomusic_test
2
  Qwen-Audio-Chat,59.0564448188711
3
- MERaLiON-AudioLLM-Whisper-SEA-LION,57.7927548441449
4
  hy_whisper_local_cs,
5
  Qwen2-Audio-7B-Instruct,71.60909856781802
6
  whisper_large_v3,
7
  old_models,
8
- cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct,51.727042965459134
9
  gemini-1.5-flash,
10
  WavLLM_fairseq,44.3133951137321
11
- SALMONN_7B,50.88458298230834
12
- cascade_whisper_large_v3_llama_3_8b_instruct,56.44481887110362
 
 
 
 
 
 
 
 
1
  Model,muchomusic_test
2
  Qwen-Audio-Chat,59.0564448188711
 
3
  hy_whisper_local_cs,
4
  Qwen2-Audio-7B-Instruct,71.60909856781802
5
  whisper_large_v3,
6
  old_models,
 
7
  gemini-1.5-flash,
8
  WavLLM_fairseq,44.3133951137321
9
+ MERaLiON-AudioLLM-Whisper-SEA-LION,51.34793597304128
10
+ MERaLiON-AudioLLM-v2-2b,55.602358887952825
11
+ MERaLiON-AudioLLM-v2-9b,63.94271272114573
12
+ Qwen2.5-Omni-3B,59.30918281381634
13
+ Qwen2.5-Omni-7B,47.598989048020215
14
+ SALMONN_7B,49.70513900589722
15
+ cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct,50.463352990732936
16
+ cascade_whisper_large_v3_llama_3_8b_instruct,56.697556866048856
17
+ phi_4_multimodal_instruct,55.2653748946925
results_organized/llama3_70b_judge/sds_singlish.csv CHANGED
@@ -1,12 +1,17 @@
1
  Model,imda_part3_30s_ds_human_test,imda_part4_30s_ds_human_test,imda_part5_30s_ds_human_test,imda_part6_30s_ds_human_test
2
  Qwen-Audio-Chat,16.4,16.0,28.2,40.4
3
- MERaLiON-AudioLLM-Whisper-SEA-LION,48.4,46.4,57.0,62.599999999999994
4
  hy_whisper_local_cs,,,,
5
  Qwen2-Audio-7B-Instruct,33.8,24.8,40.4,46.2
6
  whisper_large_v3,,,,
7
  old_models,,,,
8
- cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct,45.4,44.0,58.0,65.4
9
  gemini-1.5-flash,,,,
10
- WavLLM_fairseq,31.6,31.6,45.199999999999996,49.400000000000006
11
- SALMONN_7B,9.0,7.0,17.2,24.2
12
- cascade_whisper_large_v3_llama_3_8b_instruct,37.400000000000006,36.0,49.0,57.199999999999996
 
 
 
 
 
 
 
 
1
  Model,imda_part3_30s_ds_human_test,imda_part4_30s_ds_human_test,imda_part5_30s_ds_human_test,imda_part6_30s_ds_human_test
2
  Qwen-Audio-Chat,16.4,16.0,28.2,40.4
 
3
  hy_whisper_local_cs,,,,
4
  Qwen2-Audio-7B-Instruct,33.8,24.8,40.4,46.2
5
  whisper_large_v3,,,,
6
  old_models,,,,
 
7
  gemini-1.5-flash,,,,
8
+ WavLLM_fairseq,31.6,31.6,45.2,49.400000000000006
9
+ MERaLiON-AudioLLM-Whisper-SEA-LION,47.800000000000004,46.4,54.6,65.6
10
+ MERaLiON-AudioLLM-v2-2b,42.2,40.199999999999996,51.8,60.0
11
+ MERaLiON-AudioLLM-v2-9b,49.8,46.6,55.4,60.599999999999994
12
+ Qwen2.5-Omni-3B,42.800000000000004,33.199999999999996,52.199999999999996,58.8
13
+ Qwen2.5-Omni-7B,39.8,31.6,42.800000000000004,58.4
14
+ SALMONN_7B,9.0,7.4,16.0,25.2
15
+ cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct,48.4,45.599999999999994,53.4,56.6
16
+ cascade_whisper_large_v3_llama_3_8b_instruct,38.0,38.199999999999996,46.2,61.0
17
+ phi_4_multimodal_instruct,43.6,42.800000000000004,55.599999999999994,61.0
results_organized/llama3_70b_judge/speech_instruction.csv CHANGED
@@ -1,12 +1,17 @@
1
  Model,openhermes_audio_test,alpaca_audio_test
2
- Qwen-Audio-Chat,10.600000000000001,9.8
3
- MERaLiON-AudioLLM-Whisper-SEA-LION,65.6,74.80000000000001
4
  hy_whisper_local_cs,,
5
- Qwen2-Audio-7B-Instruct,44.800000000000004,52.599999999999994
6
  whisper_large_v3,,
7
  old_models,,
8
- cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct,72.2,73.8
9
  gemini-1.5-flash,,
10
  WavLLM_fairseq,19.2,21.6
11
- SALMONN_7B,15.8,17.2
12
- cascade_whisper_large_v3_llama_3_8b_instruct,63.0,70.8
 
 
 
 
 
 
 
 
1
  Model,openhermes_audio_test,alpaca_audio_test
2
+ Qwen-Audio-Chat,10.6,9.8
 
3
  hy_whisper_local_cs,,
4
+ Qwen2-Audio-7B-Instruct,44.8,52.599999999999994
5
  whisper_large_v3,,
6
  old_models,,
 
7
  gemini-1.5-flash,,
8
  WavLLM_fairseq,19.2,21.6
9
+ MERaLiON-AudioLLM-Whisper-SEA-LION,66.39999999999999,75.19999999999999
10
+ MERaLiON-AudioLLM-v2-2b,12.6,25.6
11
+ MERaLiON-AudioLLM-v2-9b,66.2,74.2
12
+ Qwen2.5-Omni-3B,66.0,64.0
13
+ Qwen2.5-Omni-7B,57.400000000000006,59.2
14
+ SALMONN_7B,15.4,10.4
15
+ cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct,78.8,67.0
16
+ cascade_whisper_large_v3_llama_3_8b_instruct,62.800000000000004,69.4
17
+ phi_4_multimodal_instruct,39.0,33.4
results_organized/llama3_70b_judge/sqa_english.csv CHANGED
@@ -1,12 +1,15 @@
1
- Model,slue_p2_sqa5_test,public_sg_speech_qa_test,spoken_squad_test,cn_college_listen_mcq_test,dream_tts_mcq_test
2
- Qwen-Audio-Chat,79.36274509803921,63.16860465116279,64.8327415436367,63.232056362835756,59.749085206481965
3
- MERaLiON-AudioLLM-Whisper-SEA-LION,86.76470588235293,59.7093023255814,73.66473556344609,88.50726552179657,84.31782540512285
4
- hy_whisper_local_cs,,,,,
5
- Qwen2-Audio-7B-Instruct,80.04901960784315,58.31395348837209,64.86264249672958,74.7247908410392,66.49242028227914
6
- whisper_large_v3,,,,,
7
- old_models,,,,,
8
- cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct,88.57843137254902,73.11046511627907,88.61894972902262,91.85380889476001,89.33612127548353
9
- gemini-1.5-flash,,,,89.25583443416997,
10
- WavLLM_fairseq,83.92156862745098,58.54651162790698,77.64903756307233,66.31439894319684,66.5446941975954
11
- SALMONN_7B,83.48039215686273,59.24418604651163,66.39506634273968,50.99075297225891,56.455828541557764
12
- cascade_whisper_large_v3_llama_3_8b_instruct,82.99019607843137,64.94186046511628,83.81984675761541,85.2928225451343,86.4610559330894
 
 
 
 
1
+ Model,slue_p2_sqa5_test,public_sg_speech_qa_test,spoken_squad_test,cn_college_listen_mcq_test,dream_tts_mcq_test,mmau_mini
2
+ Qwen-Audio-Chat,79.36274509803921,63.16860465116279,64.8327415436367,63.23205636283576,59.749085206481965,
3
+ hy_whisper_local_cs,,,,,,
4
+ Qwen2-Audio-7B-Instruct,80.04901960784315,58.31395348837209,64.86264249672958,74.7247908410392,66.49242028227914,
5
+ whisper_large_v3,,,,,,
6
+ old_models,,,,,,
7
+ gemini-1.5-flash,,,,89.25583443416997,,
8
+ WavLLM_fairseq,83.92156862745098,58.54651162790698,77.64903756307233,66.31439894319684,66.5446941975954,
9
+ MERaLiON-AudioLLM-Whisper-SEA-LION,86.7156862745098,59.59302325581396,74.20669033825453,57.11140466754734,51.54208050182959,53.1
10
+ MERaLiON-AudioLLM-v2-2b,83.18627450980392,69.47674418604652,81.4614090824145,66.00616468516073,61.16048092002091,50.99999999999999
11
+ MERaLiON-AudioLLM-v2-9b,89.55882352941177,75.02906976744187,89.20949355260699,84.58828709819463,83.32462101411396,56.699999999999996
12
+ SALMONN_7B,80.88235294117646,59.38953488372093,65.64754251541768,50.81461911052399,56.56037637219028,50.6
13
+ cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct,51.51960784313726,70.93023255813954,57.16314707531303,89.52003522677234,85.15420805018296,52.6
14
+ cascade_whisper_large_v3_llama_3_8b_instruct,86.96078431372548,69.68023255813954,87.43412446271725,84.98458828709819,86.1996863565081,55.900000000000006
15
+ phi_4_multimodal_instruct,83.72549019607844,74.18604651162791,83.19566436180153,75.6494936151475,77.5222164140094,58.8
results_organized/llama3_70b_judge/sqa_singlish.csv CHANGED
@@ -1,12 +1,17 @@
1
  Model,imda_part3_30s_sqa_human_test,imda_part4_30s_sqa_human_test,imda_part5_30s_sqa_human_test,imda_part6_30s_sqa_human_test
2
- Qwen-Audio-Chat,32.2,37.8,47.800000000000004,51.4
3
- MERaLiON-AudioLLM-Whisper-SEA-LION,51.4,53.2,64.80000000000001,67.2
4
  hy_whisper_local_cs,,,,
5
  Qwen2-Audio-7B-Instruct,42.0,39.6,51.6,53.6
6
  whisper_large_v3,,,,
7
  old_models,,,,
8
- cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct,56.0,66.0,74.0,71.6
9
  gemini-1.5-flash,,,,
10
- WavLLM_fairseq,45.199999999999996,46.6,50.8,62.199999999999996
11
- SALMONN_7B,40.599999999999994,36.6,44.6,46.8
12
- cascade_whisper_large_v3_llama_3_8b_instruct,49.0,53.8,57.800000000000004,64.0
 
 
 
 
 
 
 
 
1
  Model,imda_part3_30s_sqa_human_test,imda_part4_30s_sqa_human_test,imda_part5_30s_sqa_human_test,imda_part6_30s_sqa_human_test
2
+ Qwen-Audio-Chat,32.2,37.8,47.8,51.4
 
3
  hy_whisper_local_cs,,,,
4
  Qwen2-Audio-7B-Instruct,42.0,39.6,51.6,53.6
5
  whisper_large_v3,,,,
6
  old_models,,,,
 
7
  gemini-1.5-flash,,,,
8
+ WavLLM_fairseq,45.2,46.6,50.8,62.2
9
+ MERaLiON-AudioLLM-Whisper-SEA-LION,55.199999999999996,50.0,63.0,67.4
10
+ MERaLiON-AudioLLM-v2-2b,52.599999999999994,54.6,61.4,70.19999999999999
11
+ MERaLiON-AudioLLM-v2-9b,59.400000000000006,63.0,72.0,71.8
12
+ Qwen2.5-Omni-3B,52.400000000000006,54.400000000000006,66.0,69.2
13
+ Qwen2.5-Omni-7B,54.2,52.0,62.800000000000004,64.6
14
+ SALMONN_7B,42.0,35.4,45.8,49.6
15
+ cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct,51.4,46.4,54.6,62.599999999999994
16
+ cascade_whisper_large_v3_llama_3_8b_instruct,51.6,55.599999999999994,62.0,68.2
17
+ phi_4_multimodal_instruct,55.0,56.4,64.6,71.8
results_organized/llama3_70b_judge/under_development_llama3_70b_judge.csv CHANGED
@@ -7,6 +7,6 @@ whisper_large_v3,,,
7
  old_models,,,
8
  cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct,70.18719211822659,64.12654745529574,55.01831501831502
9
  gemini-1.5-flash,78.06896551724138,65.9697386519945,49.908424908424905
10
- WavLLM_fairseq,60.70935960591133,55.625859697386524,40.95238095238095
11
- SALMONN_7B,55.665024630541865,31.279229711141674,32.124542124542124
12
  cascade_whisper_large_v3_llama_3_8b_instruct,67.3103448275862,59.44979367262724,52.252747252747255
 
7
  old_models,,,
8
  cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct,70.18719211822659,64.12654745529574,55.01831501831502
9
  gemini-1.5-flash,78.06896551724138,65.9697386519945,49.908424908424905
10
+ WavLLM_fairseq,60.70935960591133,55.62585969738653,40.95238095238095
11
+ SALMONN_7B,55.665024630541865,31.27922971114167,32.124542124542124
12
  cascade_whisper_large_v3_llama_3_8b_instruct,67.3103448275862,59.44979367262724,52.252747252747255
results_organized/meteor/audio_captioning.csv CHANGED
@@ -1,12 +1,12 @@
1
  Model,audiocaps_test,wavcaps_test
2
- Qwen-Audio-Chat,0.27553015076950976,0.2355106805560457
3
- MERaLiON-AudioLLM-Whisper-SEA-LION,0.24920047034353812,0.3175511907248581
4
  hy_whisper_local_cs,,
5
- Qwen2-Audio-7B-Instruct,0.19891712076314283,0.21342294856199182
6
  whisper_large_v3,,
7
  old_models,,
8
- cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct,0.05796819723943051,0.120421856260385
9
  gemini-1.5-flash,,
10
- WavLLM_fairseq,0.041732965094428545,0.06399522524688675
11
- SALMONN_7B,0.20994052484339956,0.17175112770658157
12
- cascade_whisper_large_v3_llama_3_8b_instruct,0.07953048457785493,0.1388630786594543
 
1
  Model,audiocaps_test,wavcaps_test
2
+ Qwen-Audio-Chat,0.2755301507695097,0.2355106805560457
3
+ MERaLiON-AudioLLM-Whisper-SEA-LION,0.2492004703435381,0.3175511907248581
4
  hy_whisper_local_cs,,
5
+ Qwen2-Audio-7B-Instruct,0.1989171207631428,0.2134229485619918
6
  whisper_large_v3,,
7
  old_models,,
8
+ cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct,0.0579681972394305,0.120421856260385
9
  gemini-1.5-flash,,
10
+ WavLLM_fairseq,0.0417329650944285,0.0639952252468867
11
+ SALMONN_7B,0.2099405248433995,0.1717511277065815
12
+ cascade_whisper_large_v3_llama_3_8b_instruct,0.0795304845778549,0.1388630786594543
results_organized/wer/asr_english.csv CHANGED
@@ -1,12 +1,18 @@
1
  Model,librispeech_test_clean,librispeech_test_other,common_voice_15_en_test,peoples_speech_test,gigaspeech_test,earnings21_test,earnings22_test,tedlium3_test,tedlium3_long_form_test
2
- Qwen-Audio-Chat,0.020258799562379748,0.043467569561352074,0.11272421128398918,0.31419144746723354,0.13018910022587737,0.2655529121410546,0.3664994875132684,0.04052375714133636,0.2911540507002305
3
- MERaLiON-AudioLLM-Whisper-SEA-LION,0.024333195005092994,0.04212457676811621,0.07789795695400416,0.21632867288683053,0.14468436081215577,0.1384587164122689,0.16563713100701868,0.08094105957914907,0.10501684098564085
4
- hy_whisper_local_cs,0.02554042328441544,0.053417065466169825,0.1066766923091754,0.1991585778678581,0.0948233719154953,0.10871196540338629,0.1463228189913085,0.0467690997480572,0.05275660343910654
5
- Qwen2-Audio-7B-Instruct,0.035141660693401744,0.060415760304159495,0.11438872500819404,0.2165498391593041,0.11723812890302816,0.18872219319407232,0.23542555661330924,0.06114048472375004,0.08739585179932637
6
- whisper_large_v3,0.01878749009695552,0.03660128246354058,0.10001863741235596,0.14602420615337386,0.09459022434812692,0.11863959266711877,0.15887899737116104,0.037649480146197796,0.03208650948413402
7
  old_models,,,,,,,,,
8
- cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct,0.032349945297468596,0.05307658841999735,0.10600831614192711,0.20140159998943682,0.09948381629977261,0.11416493424197618,0.1448629161356777,0.04900464852205386,0.04396383619925545
9
  gemini-1.5-flash,,,,,,,,,
10
- WavLLM_fairseq,0.02103218017882069,0.04798834811886432,0.14533325621300636,0.3792176325635977,0.15491778414546403,0.6447482518259942,0.6671766188447099,0.06621482559171073,0.4536784258110264
11
- SALMONN_7B,0.10270871845172973,0.09671439650443565,0.3062255383962828,0.23699946689025367,0.10765150204693537,0.2577708974886327,0.3597423676988383,0.0459884319222171,0.14231519234178336
12
- cascade_whisper_large_v3_llama_3_8b_instruct,0.018334779492209605,0.03714982881570734,0.09876543209876543,0.14540692118393275,0.09515429104337297,0.11773910240019567,0.15611126487402763,0.038146268762641496,0.04754476156709803
 
 
 
 
 
 
 
 
 
 
1
  Model,librispeech_test_clean,librispeech_test_other,common_voice_15_en_test,peoples_speech_test,gigaspeech_test,earnings21_test,earnings22_test,tedlium3_test,tedlium3_long_form_test
2
+ Qwen-Audio-Chat,0.0202587995623797,0.043467569561352,0.1127242112839891,0.3141914474672335,0.1301891002258773,0.2655529121410546,0.3664994875132684,0.0405237571413363,0.2911540507002305
3
+ Qwen2-Audio-7B-Instruct,0.0351416606934017,0.0604157603041594,0.114388725008194,0.2165498391593041,0.1172381289030281,0.1887221931940723,0.2354255566133092,0.06114048472375,0.0873958517993263
4
+ whisper_large_v3,0.0187874900969555,0.0366012824635405,0.1000186374123559,0.1460242061533738,0.0945902243481269,0.1186395926671187,0.158878997371161,0.0376494801461977,0.032086509484134
 
 
5
  old_models,,,,,,,,,
 
6
  gemini-1.5-flash,,,,,,,,,
7
+ WavLLM_fairseq,0.0210321801788206,0.0479883481188643,0.1453332562130063,0.3792176325635977,0.154917784145464,0.6447482518259942,0.6671766188447099,0.0662148255917107,0.4536784258110264
8
+ MERaLiON-AudioLLM-Whisper-SEA-LION,0.023937073225940318,0.0422569845082944,0.07797507728099434,0.21620323529945748,0.14477210452030514,0.13838923413858656,0.16553574886426656,0.08154430289911642,0.10512320510547775
9
+ MERaLiON-AudioLLM-v2-2b,0.027124910401026145,0.050958064577146425,0.09270505973611995,0.20627055897299626,0.09237908290276242,0.21886082422652334,0.23935918375209228,0.03456229374401192,0.13837971990781775
10
+ MERaLiON-AudioLLM-v2-9b,0.02497453502848304,0.046607524542720415,0.08676036786395974,0.20476530792451958,0.09023061553464748,0.1084090226901313,0.15062142184399924,0.03513005216280473,0.043573834426520124
11
+ MERaLiON-AudioLLM-v2-9b-asr,0.020956728411363035,0.04040327614579984,0.0761563229028091,0.1957668115250735,0.08768103407213536,0.09210848128425476,0.1277414998676963,0.0313686526383024,0.03495834071973054
12
+ Qwen2.5-Omni-3B,0.01765571358509073,0.03898462178674788,0.08397118270448134,0.2217852079375585,0.09894231227233641,0.12490689375326566,0.18720009894897133,0.03211383556296796,0.052153873426697396
13
+ Qwen2.5-Omni-7B,0.02252235258610933,0.04165169198176556,0.08635548614726127,0.31617534194121266,0.12679717916513114,0.23232370957521317,0.2807910240306093,0.0633760334977467,0.09094132246055664
14
+ SALMONN_7B,0.09638963292715132,0.11776722719276675,0.315955552984878,0.24158949229136512,0.11024871580815716,0.27733154717568453,0.37956460424973665,0.039352755402576205,0.14139336996986349
15
+ cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct,0.03299128532085864,0.05381428868670437,0.10610471655066483,0.20285898669536326,0.09994259054523941,0.14091838890062366,0.17187922953626794,0.04939498243497392,0.08636766530756958
16
+ cascade_whisper_large_v3_llama_3_8b_instruct,0.018032972422378994,0.035504189759207064,0.09879113887442882,0.14542012514049835,0.09501640807342393,0.10872308256717546,0.1459710229559586,0.038146268762641496,0.04935295160432548
17
+ hy_whisper_local_cs,0.029086656354925113,0.05591389713810127,0.1066766923091754,0.17879147486544342,0.10212866235970408,0.14925070316060968,0.17014458107377883,0.04666264504453355,0.06973940790639957
18
+ phi_4_multimodal_instruct,0.016844607084920964,0.03851173700039722,0.08109202383018103,0.2147161396912585,0.0988294989332872,0.1306461295594268,0.22572024408764688,0.028636315247862035,0.05062932104236838
results_organized/wer/asr_mandarin.csv CHANGED
@@ -1,12 +1,18 @@
1
- Model,aishell_asr_zh_test
2
- Qwen-Audio-Chat,0.9469917443725129
3
- MERaLiON-AudioLLM-Whisper-SEA-LION,0.12812060739244918
4
- hy_whisper_local_cs,0.16361782582011838
5
- Qwen2-Audio-7B-Instruct,0.09260359129694522
6
- whisper_large_v3,0.12359684029221357
7
- old_models,
8
- cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct,0.20886539565639167
9
- gemini-1.5-flash,
10
- WavLLM_fairseq,0.7054601967888183
11
- SALMONN_7B,0.8259290055631446
12
- cascade_whisper_large_v3_llama_3_8b_instruct,0.12450753301261111
 
 
 
 
 
 
 
1
+ Model,aishell_asr_zh_test,commonvoice_zh_asr
2
+ Qwen-Audio-Chat,0.9469917443725128,
3
+ Qwen2-Audio-7B-Instruct,0.0926035912969452,
4
+ whisper_large_v3,0.1235968402922135,
5
+ old_models,,
6
+ gemini-1.5-flash,,
7
+ WavLLM_fairseq,0.7054601967888183,
8
+ MERaLiON-AudioLLM-Whisper-SEA-LION,0.12846706657955692,0.3269799259362027
9
+ MERaLiON-AudioLLM-v2-2b,0.05010789728969927,0.13139387212789344
10
+ MERaLiON-AudioLLM-v2-9b,0.05789827958266516,0.14684695260557293
11
+ MERaLiON-AudioLLM-v2-9b-asr,0.043317297222387204,0.1183419954537208
12
+ Qwen2.5-Omni-3B,0.08080418126744669,0.08551487145555639
13
+ Qwen2.5-Omni-7B,0.08943596444338857,0.0775535468448182
14
+ SALMONN_7B,0.9314703727900854,1.0013340021130595
15
+ cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct,0.20889509215814378,0.31938144990021666
16
+ cascade_whisper_large_v3_llama_3_8b_instruct,0.12450753301261111,0.1962263748225777
17
+ hy_whisper_local_cs,0.15675793391538476,0.287290695068461
18
+ phi_4_multimodal_instruct,0.12232978955079092,0.154221316286565
results_organized/wer/asr_private.csv ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Model,cna_test,idpc_short_test,idpc_test,mediacorp_short_test,mediacorp_test,parliament_test,ukusnews_test,ytb_asr_batch1,ytb_asr_batch2,ytb_asr_batch3_chinese,ytb_asr_batch3_malay,ytb_asr_batch3_tamil
2
+ MERaLiON-AudioLLM-Whisper-SEA-LION,0.14503898323187012,0.16498433693003828,0.20359281437125748,0.12828873397796267,0.12250898399215943,0.058780395496262655,0.1128757799205899,0.10724437274333563,0.13268461455292463,0.418102808691044,0.28989513404414025,0.6929759165018962
3
+ MERaLiON-AudioLLM-v2-2b,0.13494606429563175,0.15106160807518274,0.17741659538066723,0.1208680008994828,0.12250898399215943,0.18544800832623712,0.17383248251087163,0.09933164323576861,0.15990917937074278,0.25613142554319024,0.2798911851169321,0.7504943113675407
4
+ MERaLiON-AudioLLM-v2-9b,0.13334401367083198,0.15663069961712495,0.16030795551753635,0.11693276366089499,0.10454099967330938,0.06024694862333239,0.06972962752883342,0.09848659445340709,0.1110174072872743,0.19133015368309486,0.20907375718485366,0.6644679264853651
5
+ MERaLiON-AudioLLM-v2-9b-asr,0.12709601623411299,0.14009745910198398,0.16612489307100087,0.11783224645828648,0.10372427311336165,0.05284322073989971,0.055965210814898844,0.09230237381885227,0.09936209319926478,0.1494223635400106,0.19463823439076827,0.5467894071504975
6
+ Qwen2.5-Omni-3B,0.15224821104346897,0.3038635572572224,0.19743370402053037,0.13660894985383404,0.1391702058150931,0.09165957044185827,0.0828512006050293,0.12241683951755397,0.24802681370959023,0.2562374138844727,2.2815585099381335,1.2873650773070564
7
+ Qwen2.5-Omni-7B,0.17280786072839902,0.4491820396797772,0.6198460222412319,0.26714639082527547,0.3391048676902973,0.2558898665909736,0.22628096048402344,0.20300376430821235,0.34827548924208024,0.19881293057763647,1.4799262866921152,1.0804025801432693
8
+ SALMONN_7B,0.1492577165438428,0.2398190045248869,0.5414884516680923,0.19901056892286936,0.3636883371447239,0.20430031223389156,0.191869918699187,0.2207497887378044,0.3495513028435506,0.8858293587705353,1.0858672282918695,0.985267900554277
9
+ cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct,0.15171419416853574,0.19735468151757746,0.17040205303678357,0.1541488644029683,0.15754655341391702,0.09007474690131517,0.12278313480809226,0.12475992932319274,0.12552708400908205,0.3469210386857446,0.3143784827344127,0.9665002755178114
10
+ cascade_whisper_large_v3_llama_3_8b_instruct,0.13815016554523124,0.15344926428434932,0.16184773310521813,0.11434675061839443,0.15125775890231952,0.06537988456807645,0.08943089430894309,0.10816624414227549,0.08387933830684398,0.2698675145733969,0.3119213724715897,0.8976532365239376
11
+ hy_whisper_local_cs,0.14674783723165652,0.18308388444135051,0.17570573139435414,0.12885091072633237,0.1256125449199608,0.07257072570725707,0.16948383437322745,0.1284858262272413,0.14315061087685155,0.27520932697403283,0.2421569917950068,0.8339924151567211
12
+ phi_4_multimodal_instruct,0.19080422941364947,0.5388096066829099,0.26073567151411464,0.1217674836968743,0.19813786344331918,0.2778645094143249,0.07521270561542824,0.16939386955519706,0.23232781922369986,0.44008479067302597,3.762932736606555,2.7500567242552916
results_organized/wer/asr_sea.csv ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Model,commonvoice_17_id_asr,commonvoice_17_ta_asr,commonvoice_17_vi_asr,fleurs_tamil_ta_30_asr,gigaspeech2_id_test,gigaspeech2_th_test,gigaspeech2_vi_test,lotus_thai_th_30_asr
2
+ MERaLiON-AudioLLM-Whisper-SEA-LION,0.25954549636581103,0.5284951114826634,0.9221892864704637,0.4624736472241743,0.337184855698226,0.9866395307075302,0.9818897503814326,0.8520208370756243
3
+ MERaLiON-AudioLLM-v2-2b,0.08547244456711749,0.13853008043879414,0.14196485284776625,0.1432185523541813,0.17842684134623737,0.19968394588770502,0.16825573283269715,0.014873360876594216
4
+ MERaLiON-AudioLLM-v2-9b,0.11334989419449812,0.15591770571023683,0.15646834639000634,0.16085734364019677,0.1722759890883186,0.20004788698671136,0.11314793912959634,0.018681516076881625
5
+ MERaLiON-AudioLLM-v2-9b-asr,0.07921611923820039,0.12871226564172622,0.1423883125132331,0.1383345045678145,0.16282383194620612,0.18238237758889023,0.09499798648962901,0.010670019759295851
6
+ Qwen2.5-Omni-3B,0.13731714049130556,1.0276387288835422,0.2463476603853483,1.3477160927617708,0.3110002953799107,0.4670274152998923,0.19581530154444754,0.4822705227231902
7
+ Qwen2.5-Omni-7B,0.18235348238108381,1.0684188526512177,0.22041075587550285,1.2090302178496135,0.26146334682814104,0.2936956781994493,0.22408385278119664,0.0984012933357284
8
+ SALMONN_7B,1.1888858220627472,1.4272941368377052,1.496294727927165,1.507519325368939,2.1181172136986777,1.2470441757452413,1.5460526688938172,1.1351535836177475
9
+ cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct,0.09977918851780293,0.23805397249380653,0.1567859411391065,0.2724525650035137,0.2191718937327333,0.276058900993655,0.17136958408249153,0.06815160768816239
10
+ cascade_whisper_large_v3_llama_3_8b_instruct,0.07815806421933941,0.24404355317218387,0.11676900275248782,0.28397751229796203,0.1926224523482703,0.20872022028013887,0.15538061017872032,0.031794503323154304
11
+ hy_whisper_local_cs,0.10267733922163952,0.31793713743921215,0.1681134871903451,0.33113141250878425,0.21382030476256667,0.26486292350053875,0.1781020821398794,0.076019400035926
12
+ phi_4_multimodal_instruct,1.327169012788665,1.1784589191228196,1.1070294304467498,1.7016514406184118,5.803850364012302,1.7344522925894887,2.5042567310800923,1.2856834920064666
results_organized/wer/asr_singlish.csv CHANGED
@@ -1,12 +1,18 @@
1
  Model,imda_part1_asr_test,imda_part2_asr_test,imda_part3_30s_asr_test,imda_part4_30s_asr_test,imda_part5_30s_asr_test,imda_part6_30s_asr_test
2
- Qwen-Audio-Chat,0.10550313315290274,0.45479263046830615,0.6412550574306894,1.173131813552289,0.3016882870525747,0.31394240863063033
3
- MERaLiON-AudioLLM-Whisper-SEA-LION,0.042815692585277836,0.04719584449314179,0.2139462894072284,0.3002929748896629,0.15368227517473845,0.10833508293092589
4
- hy_whisper_local_cs,0.06319947333772219,0.2719340962584206,0.23856138159502538,0.33742408429629445,0.16663991478309087,0.12873269917149824
5
- Qwen2-Audio-7B-Instruct,0.07197717796796138,0.1905689473257041,0.35076166942732234,0.5613424034000176,0.27856006770658537,0.2245352799625317
6
- whisper_large_v3,0.06844171360300393,0.3171008846684522,0.27026366524560785,0.4618189591218298,0.2143555471246589,0.1698509342851144
7
  old_models,,,,,,
8
- cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct,0.07041669714480775,0.32988393799204613,0.3035544573275043,0.4779640131272869,0.22881615619208825,0.1789273082575623
9
  gemini-1.5-flash,,,,,,
10
- WavLLM_fairseq,0.10077292565771828,0.4463923382842302,0.7540934640345399,1.143645714142011,0.39796588405247263,0.42541061709652933
11
- SALMONN_7B,0.0925804013361617,0.42346400454508565,0.6569229098215983,0.7593582215292535,0.34868891450584405,0.24872817713464365
12
- cascade_whisper_large_v3_llama_3_8b_instruct,0.06922195401458074,0.31912994075156237,0.29992939962527493,0.4750971343786543,0.22004640235805695,0.17467982364056267
 
 
 
 
 
 
 
 
 
 
1
  Model,imda_part1_asr_test,imda_part2_asr_test,imda_part3_30s_asr_test,imda_part4_30s_asr_test,imda_part5_30s_asr_test,imda_part6_30s_asr_test
2
+ Qwen-Audio-Chat,0.1055031331529027,0.4547926304683061,0.6412550574306894,1.173131813552289,0.3016882870525747,0.3139424086306303
3
+ Qwen2-Audio-7B-Instruct,0.0719771779679613,0.1905689473257041,0.3507616694273223,0.5613424034000176,0.2785600677065853,0.2245352799625317
4
+ whisper_large_v3,0.0684417136030039,0.3171008846684522,0.2702636652456078,0.4618189591218298,0.2143555471246589,0.1698509342851144
 
 
5
  old_models,,,,,,
 
6
  gemini-1.5-flash,,,,,,
7
+ WavLLM_fairseq,0.1007729256577182,0.4463923382842302,0.7540934640345399,1.143645714142011,0.3979658840524726,0.4254106170965293
8
+ MERaLiON-AudioLLM-Whisper-SEA-LION,0.04303513520103382,0.0473581689797906,0.21299589974746788,0.29660878421707804,0.15406166552363165,0.1087388362215152
9
+ MERaLiON-AudioLLM-v2-2b,0.049057615877892376,0.05819332846359873,0.26414044043772233,0.3595795244502006,0.20202536078562985,0.1493725673864242
10
+ MERaLiON-AudioLLM-v2-9b,0.051959134908443665,0.14532099667234802,0.22654574089662477,0.2948987161915779,0.16760298259181977,0.12655243140231592
11
+ MERaLiON-AudioLLM-v2-9b-asr,0.04362031550971643,0.054094635175716256,0.19622831075026476,0.24570911239925058,0.1403598371539887,0.0989680065892537
12
+ Qwen2.5-Omni-3B,0.04657059956599127,0.11265319373427482,0.49541097564287073,1.0728162054093475,0.273861464154908,0.17795830036014793
13
+ Qwen2.5-Omni-7B,0.04854558310779509,0.12052593133674215,0.6256143590300595,1.1316375158747123,0.34107192365498823,0.36374941455772863
14
+ SALMONN_7B,0.09275107892619414,0.45783621459297136,0.681280039101746,0.7865181254636674,0.37533379054734356,0.25522053004731987
15
+ cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct,0.07053860970911661,0.3298433568703839,0.2810437993863198,0.4594298934979693,0.21829536997854984,0.17514817745764627
16
+ cascade_whisper_large_v3_llama_3_8b_instruct,0.06922195401458074,0.31912994075156237,0.2770250088250468,0.4581096203900464,0.21391778902978215,0.1722411537654032
17
+ hy_whisper_local_cs,0.06692999780557385,0.2735167600032465,0.25580416542210876,0.3612895924757007,0.186411988735025,0.14417222500363377
18
+ phi_4_multimodal_instruct,0.057615877892375586,0.3451018586153721,0.4381839411301491,1.4697028756805695,0.23859275364433613,0.1439784234241509
results_organized/wer/under_development_wer.csv CHANGED
@@ -1,14 +1,14 @@
1
- Model,cna_test,idpc_test,parliament_test,ukusnews_test,mediacorp_test,idpc_short_test,parliament_short_test,ukusnews_short_test,mediacorp_short_test,ytb_asr_batch1,ytb_asr_batch2,seame_dev_man,seame_dev_sge,ytb_asr_batch3_malay,ytb_asr_batch3_ms_ms_prompt,ytb_asr_batch3_chinese,ytb_asr_batch3_zh_zh_prompt
2
- Qwen-Audio-Chat,0.19753284203780838,0.7710863986313088,0.26279685873781816,0.3158631121194933,0.4498529892192094,0.6008025988916491,0.09347360821020603,0.10399586086125925,0.2548909377108163,0.2297764461857571,0.4315277327278625,0.8783373786407767,1.05567969634822,2.8890790224211313,2.8990790224211313,,
3
- MERaLiON-AudioLLM-Whisper-SEA-LION,0.15924383210509452,0.30008554319931563,0.058922319992430694,0.12554358101720553,0.170859196341065,0.24918784635964075,0.056935097083623425,0.10144869855926132,0.13301101866426804,0.11484981178458939,0.15162720294085846,0.388282092772384,0.35550521901496834,0.289500241,0.3031898556447721,0.29155272919978803,0.28269210386857446
4
- hy_whisper_local_cs,0.15710776460536152,0.19863130881094954,0.058638471000094616,0.07199848742673473,0.13124795818360013,0.17638066118861073,0.06559913359634872,0.07828544137546764,0.1154711041151338,0.11546439271721595,0.22990593577684074,0.3134101941747573,0.33199669411368576,,,,
5
- Qwen2-Audio-7B-Instruct,0.2067713339741536,0.19093242087254064,0.23270886555019396,0.13843826810361126,0.18694870957203527,0.21326199120963119,0.08416492612361723,0.1194380323171217,0.17180121430177647,0.16843358684796805,0.2080008649583739,0.5522518878101402,0.5486546879304539,0.9251458909218551,0.9981132903339037
6
- whisper_large_v3,0.13841717398269784,0.19880239520958085,0.0753619074652285,0.07135564378899603,0.12054884024828487,0.1662526275558953,0.05543951935226013,0.06168908700151238,0.11715763436024286,0.12226319428439733,0.17210509244242622,0.7225930420711975,0.5377268970583734,0.237374402,0.237374402,0.21278219395866454,0.21278219395866454
7
- whisper_large_v2,,,,,,,,,,,,,,,,0.2802967673555909,0.2802967673555909
8
- old_models,,,,,,,,,,,,,,,,,
9
- cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct,0.15171419416853574,0.16766467065868262,0.06282524363705176,0.07388920400831915,0.12455080039202875,0.16931014714313014,0.07325752301384698,0.06877338215394412,0.14571621317742298,0.1400092187139894,0.2192622950819672,0.7824973031283711,0.5840399155162387,,
10
- gemini-1.5-flash,,,,,,,,,,0.1089344703080587,,0.9690871089536138,1.1100431601824359,,
11
- WavLLM_fairseq,0.26946491509131687,0.7686911890504705,0.5216434856656259,0.5911892607298166,0.3595230316889905,0.36728454041658704,0.09512390087929656,0.2066783411605508,0.2621992354396222,0.41876008296842593,0.48091685587631094,1.2913969795037756,1.2204842511249197,,
12
- SALMONN_7B,0.15395706504325538,0.4550898203592814,0.3010928186204939,0.18918510115333712,0.32089186540346293,0.26313777947639977,0.08676929424202573,0.09042426172092653,0.1751742747919946,0.21487285856956287,0.3238620391393664,1.2721817691477886,1.0189782362484312,,
13
- cascade_whisper_large_v3_llama_3_8b_instruct,0.13798996048275125,0.17741659538066723,0.07517267480367111,0.07642276422764227,0.13598497223129696,0.15803554366520162,0.05742502771975968,0.0700867627159118,0.11434675061839443,0.12579703464700007,0.23561466104443723,0.6848705501618123,0.507882090054792,,
14
- Phi4-Multimodal-Instruct,,,,,,,,,,,,,,,,0.3390567037625861,0.21534711181770005
 
1
+ Model,seame_dev_man,seame_dev_sge,ytb_asr_batch3_ms_ms_prompt,ytb_asr_batch3_zh_zh_prompt
2
+ Qwen-Audio-Chat,0.8783373786407767,1.05567969634822,2.899079022421131,
3
+ MERaLiON-AudioLLM-Whisper-SEA-LION,0.388282092772384,0.3555052190149683,0.3031898556447721,0.2826921038685744
4
+ hy_whisper_local_cs,0.3134101941747573,0.3319966941136857,,
5
+ Qwen2-Audio-7B-Instruct,0.5522518878101402,0.5486546879304539,0.9981132903339036,
6
+ whisper_large_v3,0.7225930420711975,0.5377268970583734,0.237374402,0.2127821939586645
7
+ whisper_large_v2,,,,0.2802967673555909
8
+ old_models,,,,
9
+ cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct,0.7824973031283711,0.5840399155162387,,
10
+ gemini-1.5-flash,0.9690871089536138,1.110043160182436,,
11
+ WavLLM_fairseq,1.2913969795037756,1.2204842511249197,,
12
+ SALMONN_7B,1.2721817691477886,1.0189782362484312,,
13
+ cascade_whisper_large_v3_llama_3_8b_instruct,0.6848705501618123,0.507882090054792,,
14
+ Phi4-Multimodal-Instruct,,,,0.2153471118177