He Yingxu
committed on
Commit
·
b46797f
1
Parent(s):
7ef552a
add meralion2
Browse files
- app.py +4 -0
- app/content.py +241 -41
- app/draw_diagram.py +2 -32
- app/pages.py +57 -193
- app/summarization.py +2 -2
- model_information.py +24 -1
- results_organized/bleu/st.csv +14 -9
- results_organized/llama3_70b_judge/accent_recognition.csv +11 -6
- results_organized/llama3_70b_judge/audio_captioning.csv +9 -4
- results_organized/llama3_70b_judge/audio_scene_question_answering.csv +9 -4
- results_organized/llama3_70b_judge/emotion_recognition.csv +9 -4
- results_organized/llama3_70b_judge/gender_recognition.csv +17 -12
- results_organized/llama3_70b_judge/music_understanding.csv +9 -4
- results_organized/llama3_70b_judge/sds_singlish.csv +10 -5
- results_organized/llama3_70b_judge/speech_instruction.csv +11 -6
- results_organized/llama3_70b_judge/sqa_english.csv +15 -12
- results_organized/llama3_70b_judge/sqa_singlish.csv +11 -6
- results_organized/llama3_70b_judge/under_development_llama3_70b_judge.csv +2 -2
- results_organized/meteor/audio_captioning.csv +7 -7
- results_organized/wer/asr_english.csv +15 -9
- results_organized/wer/asr_mandarin.csv +18 -12
- results_organized/wer/asr_private.csv +12 -0
- results_organized/wer/asr_sea.csv +12 -0
- results_organized/wer/asr_singlish.csv +15 -9
- results_organized/wer/under_development_wer.csv +14 -14
app.py
CHANGED
@@ -19,6 +19,8 @@ pages = {
|
|
19 |
'ASR-English' : asr_english,
|
20 |
'ASR-Mandarin' : asr_mandarin,
|
21 |
'ASR-Singlish' : asr_singlish,
|
|
|
|
|
22 |
'Speech Translation' : speech_translation,
|
23 |
'SQA-English' : speech_question_answering_english,
|
24 |
'SQA-Singlish' : speech_question_answering_singlish,
|
@@ -47,6 +49,8 @@ menu_items = [
|
|
47 |
sac.MenuItem(label='ASR-English', icon='mic'),
|
48 |
sac.MenuItem(label='ASR-Mandarin', icon='mic'),
|
49 |
sac.MenuItem(label='ASR-Singlish', icon='mic'),
|
|
|
|
|
50 |
]
|
51 |
),
|
52 |
|
|
|
19 |
'ASR-English' : asr_english,
|
20 |
'ASR-Mandarin' : asr_mandarin,
|
21 |
'ASR-Singlish' : asr_singlish,
|
22 |
+
'ASR-SEA' : asr_sea,
|
23 |
+
'ASR-Private' : asr_private,
|
24 |
'Speech Translation' : speech_translation,
|
25 |
'SQA-English' : speech_question_answering_english,
|
26 |
'SQA-Singlish' : speech_question_answering_singlish,
|
|
|
49 |
sac.MenuItem(label='ASR-English', icon='mic'),
|
50 |
sac.MenuItem(label='ASR-Mandarin', icon='mic'),
|
51 |
sac.MenuItem(label='ASR-Singlish', icon='mic'),
|
52 |
+
sac.MenuItem(label='ASR-SEA', icon='mic'),
|
53 |
+
sac.MenuItem(label='ASR-Private', icon='mic'),
|
54 |
]
|
55 |
),
|
56 |
|
app/content.py
CHANGED
@@ -1,5 +1,157 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
|
2 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
3 |
'LibriSpeech-Clean' : 'librispeech_test_clean',
|
4 |
'LibriSpeech-Other' : 'librispeech_test_other',
|
5 |
'CommonVoice-15-EN' : 'common_voice_15_en_test',
|
@@ -9,65 +161,102 @@ displayname2datasetname = {
|
|
9 |
'Earnings-22' : 'earnings22_test',
|
10 |
'TED-LIUM-3' : 'tedlium3_test',
|
11 |
'TED-LIUM-3-LongForm' : 'tedlium3_long_form_test',
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
12 |
'AISHELL-ASR-ZH' : 'aishell_asr_zh_test',
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
13 |
'CoVoST2-EN-ID' : 'covost2_en_id_test',
|
14 |
'CoVoST2-EN-ZH' : 'covost2_en_zh_test',
|
15 |
'CoVoST2-EN-TA' : 'covost2_en_ta_test',
|
16 |
'CoVoST2-ID-EN' : 'covost2_id_en_test',
|
17 |
'CoVoST2-ZH-EN' : 'covost2_zh_en_test',
|
18 |
'CoVoST2-TA-EN' : 'covost2_ta_en_test',
|
|
|
19 |
'CN-College-Listen-MCQ': 'cn_college_listen_mcq_test',
|
20 |
'DREAM-TTS-MCQ' : 'dream_tts_mcq_test',
|
21 |
'SLUE-P2-SQA5' : 'slue_p2_sqa5_test',
|
22 |
'Public-SG-Speech-QA' : 'public_sg_speech_qa_test',
|
23 |
'Spoken-SQuAD' : 'spoken_squad_test',
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
24 |
'OpenHermes-Audio' : 'openhermes_audio_test',
|
25 |
'ALPACA-Audio' : 'alpaca_audio_test',
|
|
|
26 |
'WavCaps' : 'wavcaps_test',
|
27 |
'AudioCaps' : 'audiocaps_test',
|
|
|
28 |
'Clotho-AQA' : 'clotho_aqa_test',
|
29 |
'WavCaps-QA' : 'wavcaps_qa_test',
|
30 |
'AudioCaps-QA' : 'audiocaps_qa_test',
|
|
|
|
|
|
|
|
|
|
|
31 |
'VoxCeleb-Accent' : 'voxceleb_accent_test',
|
32 |
'MNSC-AR-Sentence' : 'imda_ar_sentence',
|
33 |
'MNSC-AR-Dialogue' : 'imda_ar_dialogue',
|
|
|
34 |
'VoxCeleb-Gender' : 'voxceleb_gender_test',
|
35 |
'IEMOCAP-Gender' : 'iemocap_gender_test',
|
36 |
-
|
37 |
-
'MELD-Sentiment' : 'meld_sentiment_test',
|
38 |
-
'MELD-Emotion' : 'meld_emotion_test',
|
39 |
'MuChoMusic' : 'muchomusic_test',
|
40 |
-
'MNSC-PART1-ASR' : 'imda_part1_asr_test',
|
41 |
-
'MNSC-PART2-ASR' : 'imda_part2_asr_test',
|
42 |
-
'MNSC-PART3-ASR' : 'imda_part3_30s_asr_test',
|
43 |
-
'MNSC-PART4-ASR' : 'imda_part4_30s_asr_test',
|
44 |
-
'MNSC-PART5-ASR' : 'imda_part5_30s_asr_test',
|
45 |
-
'MNSC-PART6-ASR' : 'imda_part6_30s_asr_test',
|
46 |
-
'MNSC-PART3-SQA' : 'imda_part3_30s_sqa_human_test',
|
47 |
-
'MNSC-PART4-SQA' : 'imda_part4_30s_sqa_human_test',
|
48 |
-
'MNSC-PART5-SQA' : 'imda_part5_30s_sqa_human_test',
|
49 |
-
'MNSC-PART6-SQA' : 'imda_part6_30s_sqa_human_test',
|
50 |
-
'MNSC-PART3-SDS' : 'imda_part3_30s_ds_human_test',
|
51 |
-
'MNSC-PART4-SDS' : 'imda_part4_30s_ds_human_test',
|
52 |
-
'MNSC-PART5-SDS' : 'imda_part5_30s_ds_human_test',
|
53 |
-
'MNSC-PART6-SDS' : 'imda_part6_30s_ds_human_test',
|
54 |
|
55 |
-
'
|
56 |
-
'
|
57 |
-
'
|
58 |
-
'UKUS-News' : 'ukusnews_test',
|
59 |
-
'Mediacorp' : 'mediacorp_test',
|
60 |
-
'IDPC-Short' : 'idpc_short_test',
|
61 |
-
'Parliament-Short': 'parliament_short_test',
|
62 |
-
'UKUS-News-Short' : 'ukusnews_short_test',
|
63 |
-
'Mediacorp-Short' : 'mediacorp_short_test',
|
64 |
-
|
65 |
-
'YouTube ASR: English Singapore Content': 'ytb_asr_batch1',
|
66 |
-
'YouTube ASR: English with Strong Emotion': 'ytb_asr_batch2',
|
67 |
-
'YouTube ASR: Malay with English Prompt': 'ytb_asr_batch3_malay',
|
68 |
-
'YouTube ASR: Malay with Malay Prompt': 'ytb_asr_batch3_ms_ms_prompt',
|
69 |
-
'YouTube ASR: Chinese with English Prompt': 'ytb_asr_batch3_chinese',
|
70 |
-
'YouTube ASR: Chinese with Chinese Prompt': 'ytb_asr_batch3_zh_zh_prompt',
|
71 |
|
72 |
'YouTube SQA: Malay': 'ytb_sqa_batch3_malay',
|
73 |
'YouTube SQA: Chinese': 'ytb_sqa_batch3_chinese',
|
@@ -76,15 +265,13 @@ displayname2datasetname = {
|
|
76 |
'YouTube SDS: Malay': 'ytb_sds_batch3_malay',
|
77 |
'YouTube SDS: Chinese': 'ytb_sds_batch3_chinese',
|
78 |
'YouTube SDS: Tamil': 'ytb_sds_batch3_tamil',
|
79 |
-
|
80 |
-
'SEAME-Dev-Mandarin' : 'seame_dev_man',
|
81 |
-
'SEAME-Dev-Singlish' : 'seame_dev_sge',
|
82 |
|
83 |
-
'YouTube SQA: English with Singapore Content': 'ytb_sqa_batch1',
|
84 |
-
'YouTube SDS: English with Singapore Content': 'ytb_sds_batch1',
|
85 |
-
'YouTube PQA: English with Singapore Content': 'ytb_pqa_batch1',
|
86 |
|
87 |
-
}
|
|
|
|
|
|
|
88 |
|
89 |
datasetname2diaplayname = {datasetname: displayname for displayname, datasetname in displayname2datasetname.items()}
|
90 |
|
@@ -152,6 +339,19 @@ dataset_diaplay_information = {
|
|
152 |
'Parliament-Short': 'Under Development',
|
153 |
'UKUS-News-Short' : 'Under Development',
|
154 |
'Mediacorp-Short' : 'Under Development',
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
155 |
|
156 |
'YouTube ASR: English Singapore Content' : 'YouTube Evaluation Dataset for ASR Task: <br> This dataset contains English and Singlish audio clips, featuring Singapore-related content. <br> It includes approximately 2.5 hours of audio, with individual clips ranging from 2 seconds to 30 seconds in length.',
|
157 |
|
|
|
1 |
+
asr_english_datasets = [
|
2 |
+
'LibriSpeech-Clean',
|
3 |
+
'LibriSpeech-Other',
|
4 |
+
'CommonVoice-15-EN',
|
5 |
+
'Peoples-Speech',
|
6 |
+
'GigaSpeech-1',
|
7 |
+
'Earnings-21',
|
8 |
+
'Earnings-22',
|
9 |
+
'TED-LIUM-3',
|
10 |
+
'TED-LIUM-3-LongForm',
|
11 |
+
]
|
12 |
|
13 |
+
|
14 |
+
asr_singlish_datasets = [
|
15 |
+
'MNSC-PART1-ASR',
|
16 |
+
'MNSC-PART2-ASR',
|
17 |
+
'MNSC-PART3-ASR',
|
18 |
+
'MNSC-PART4-ASR',
|
19 |
+
'MNSC-PART5-ASR',
|
20 |
+
'MNSC-PART6-ASR',
|
21 |
+
]
|
22 |
+
|
23 |
+
|
24 |
+
asr_mandarin_datasets = [
|
25 |
+
'AISHELL-ASR-ZH',
|
26 |
+
'CommonVoice-ZH'
|
27 |
+
]
|
28 |
+
|
29 |
+
|
30 |
+
asr_sea_datasets = [
|
31 |
+
'CommonVoice-17-Indonesian',
|
32 |
+
'CommonVoice-17-Tamil',
|
33 |
+
# 'CommonVoice-17-Thai',
|
34 |
+
'CommonVoice-17-Vietnamese',
|
35 |
+
'GigaSpeech-2-Indonesain',
|
36 |
+
'GigaSpeech-2-Thai',
|
37 |
+
'GigaSpeech-2-Vietnamese',
|
38 |
+
'Fleurs-Tamil',
|
39 |
+
'Lotus-Thai'
|
40 |
+
]
|
41 |
+
|
42 |
+
|
43 |
+
asr_private_datasets = [
|
44 |
+
'CNA',
|
45 |
+
'IDPC',
|
46 |
+
'Parliament',
|
47 |
+
'UKUS-News',
|
48 |
+
'Mediacorp',
|
49 |
+
'IDPC-Short',
|
50 |
+
'Parliament-Short',
|
51 |
+
'UKUS-News-Short',
|
52 |
+
'Mediacorp-Short',
|
53 |
+
'YouTube ASR: English Singapore Content',
|
54 |
+
'YouTube ASR: English with Strong Emotion',
|
55 |
+
'YouTube ASR: Malay with English Prompt',
|
56 |
+
'YouTube ASR: Chinese with English Prompt',
|
57 |
+
'YouTube ASR: Tamil with English Prompt'
|
58 |
+
]
|
59 |
+
|
60 |
+
|
61 |
+
speech_translation_datasets = [
|
62 |
+
'CoVoST2-EN-ID',
|
63 |
+
'CoVoST2-EN-ZH',
|
64 |
+
'CoVoST2-EN-TA',
|
65 |
+
'CoVoST2-ID-EN',
|
66 |
+
'CoVoST2-ZH-EN',
|
67 |
+
'CoVoST2-TA-EN'
|
68 |
+
]
|
69 |
+
|
70 |
+
|
71 |
+
speech_qa_english_datasets = [
|
72 |
+
'CN-College-Listen-MCQ',
|
73 |
+
'DREAM-TTS-MCQ',
|
74 |
+
'SLUE-P2-SQA5',
|
75 |
+
'Public-SG-Speech-QA',
|
76 |
+
'Spoken-SQuAD',
|
77 |
+
'MMAU-mini'
|
78 |
+
]
|
79 |
+
|
80 |
+
|
81 |
+
speech_qa_singlish_datasets = [
|
82 |
+
'MNSC-PART3-SQA',
|
83 |
+
'MNSC-PART4-SQA',
|
84 |
+
'MNSC-PART5-SQA',
|
85 |
+
'MNSC-PART6-SQA',
|
86 |
+
]
|
87 |
+
|
88 |
+
|
89 |
+
sds_datasets = [
|
90 |
+
'MNSC-PART3-SDS',
|
91 |
+
'MNSC-PART4-SDS',
|
92 |
+
'MNSC-PART5-SDS',
|
93 |
+
'MNSC-PART6-SDS',
|
94 |
+
]
|
95 |
+
|
96 |
+
|
97 |
+
si_datasets = [
|
98 |
+
'OpenHermes-Audio',
|
99 |
+
'ALPACA-Audio',
|
100 |
+
]
|
101 |
+
|
102 |
+
|
103 |
+
ac_datasets = [
|
104 |
+
'WavCaps',
|
105 |
+
'AudioCaps',
|
106 |
+
]
|
107 |
+
|
108 |
+
|
109 |
+
asqa_datasets = [
|
110 |
+
'Clotho-AQA',
|
111 |
+
'WavCaps-QA',
|
112 |
+
'AudioCaps-QA'
|
113 |
+
]
|
114 |
+
|
115 |
+
|
116 |
+
er_datasets = [
|
117 |
+
'IEMOCAP-Emotion',
|
118 |
+
'MELD-Sentiment',
|
119 |
+
'MELD-Emotion',
|
120 |
+
]
|
121 |
+
|
122 |
+
|
123 |
+
ar_datasets = [
|
124 |
+
'VoxCeleb-Accent',
|
125 |
+
'MNSC-AR-Sentence',
|
126 |
+
'MNSC-AR-Dialogue',
|
127 |
+
]
|
128 |
+
|
129 |
+
|
130 |
+
gr_datasets = [
|
131 |
+
'VoxCeleb-Gender',
|
132 |
+
'IEMOCAP-Gender'
|
133 |
+
]
|
134 |
+
|
135 |
+
|
136 |
+
music_datasets = ['MuChoMusic']
|
137 |
+
|
138 |
+
|
139 |
+
wer_development_datasets = [
|
140 |
+
'YouTube ASR: Malay with Malay Prompt',
|
141 |
+
'YouTube ASR: Chinese with Chinese Prompt',
|
142 |
+
'SEAME-Dev-Mandarin',
|
143 |
+
'SEAME-Dev-Singlish',
|
144 |
+
]
|
145 |
+
|
146 |
+
|
147 |
+
non_wer_development_datasets = [
|
148 |
+
'YouTube SQA: English with Singapore Content',
|
149 |
+
'YouTube SDS: English with Singapore Content',
|
150 |
+
'YouTube PQA: English with Singapore Content',
|
151 |
+
]
|
152 |
+
|
153 |
+
|
154 |
+
wer_displayname2datasetname = {
|
155 |
'LibriSpeech-Clean' : 'librispeech_test_clean',
|
156 |
'LibriSpeech-Other' : 'librispeech_test_other',
|
157 |
'CommonVoice-15-EN' : 'common_voice_15_en_test',
|
|
|
161 |
'Earnings-22' : 'earnings22_test',
|
162 |
'TED-LIUM-3' : 'tedlium3_test',
|
163 |
'TED-LIUM-3-LongForm' : 'tedlium3_long_form_test',
|
164 |
+
|
165 |
+
'MNSC-PART1-ASR' : 'imda_part1_asr_test',
|
166 |
+
'MNSC-PART2-ASR' : 'imda_part2_asr_test',
|
167 |
+
'MNSC-PART3-ASR' : 'imda_part3_30s_asr_test',
|
168 |
+
'MNSC-PART4-ASR' : 'imda_part4_30s_asr_test',
|
169 |
+
'MNSC-PART5-ASR' : 'imda_part5_30s_asr_test',
|
170 |
+
'MNSC-PART6-ASR' : 'imda_part6_30s_asr_test',
|
171 |
+
|
172 |
'AISHELL-ASR-ZH' : 'aishell_asr_zh_test',
|
173 |
+
'CommonVoice-ZH' : 'commonvoice_zh_asr',
|
174 |
+
|
175 |
+
'CommonVoice-17-Indonesian' : 'commonvoice_17_id_asr',
|
176 |
+
'CommonVoice-17-Tamil' : 'commonvoice_17_ta_asr',
|
177 |
+
'CommonVoice-17-Thai' : 'commonvoice_17_th_asr',
|
178 |
+
'CommonVoice-17-Vietnamese' : 'commonvoice_17_vi_asr',
|
179 |
+
'GigaSpeech-2-Indonesain' : 'gigaspeech2_id_test',
|
180 |
+
'GigaSpeech-2-Thai' : 'gigaspeech2_th_test',
|
181 |
+
'GigaSpeech-2-Vietnamese' : 'gigaspeech2_vi_test',
|
182 |
+
'Fleurs-Tamil' : 'fleurs_tamil_ta_30_asr',
|
183 |
+
'Lotus-Thai' : 'lotus_thai_th_30_asr',
|
184 |
+
|
185 |
+
'CNA' : 'cna_test',
|
186 |
+
'IDPC' : 'idpc_test',
|
187 |
+
'Parliament' : 'parliament_test',
|
188 |
+
'UKUS-News' : 'ukusnews_test',
|
189 |
+
'Mediacorp' : 'mediacorp_test',
|
190 |
+
'IDPC-Short' : 'idpc_short_test',
|
191 |
+
'Parliament-Short': 'parliament_short_test',
|
192 |
+
'UKUS-News-Short' : 'ukusnews_short_test',
|
193 |
+
'Mediacorp-Short' : 'mediacorp_short_test',
|
194 |
+
|
195 |
+
'YouTube ASR: English Singapore Content': 'ytb_asr_batch1',
|
196 |
+
'YouTube ASR: English with Strong Emotion': 'ytb_asr_batch2',
|
197 |
+
'YouTube ASR: Malay with English Prompt': 'ytb_asr_batch3_malay',
|
198 |
+
'YouTube ASR: Chinese with English Prompt': 'ytb_asr_batch3_chinese',
|
199 |
+
'YouTube ASR: Tamil with English Prompt': 'ytb_asr_batch3_tamil',
|
200 |
+
|
201 |
+
'YouTube ASR: Malay with Malay Prompt': 'ytb_asr_batch3_ms_ms_prompt',
|
202 |
+
'YouTube ASR: Chinese with Chinese Prompt': 'ytb_asr_batch3_zh_zh_prompt',
|
203 |
+
|
204 |
+
'SEAME-Dev-Mandarin' : 'seame_dev_man',
|
205 |
+
'SEAME-Dev-Singlish' : 'seame_dev_sge',
|
206 |
+
}
|
207 |
+
|
208 |
+
|
209 |
+
non_wer_displayname2datasetname = {
|
210 |
'CoVoST2-EN-ID' : 'covost2_en_id_test',
|
211 |
'CoVoST2-EN-ZH' : 'covost2_en_zh_test',
|
212 |
'CoVoST2-EN-TA' : 'covost2_en_ta_test',
|
213 |
'CoVoST2-ID-EN' : 'covost2_id_en_test',
|
214 |
'CoVoST2-ZH-EN' : 'covost2_zh_en_test',
|
215 |
'CoVoST2-TA-EN' : 'covost2_ta_en_test',
|
216 |
+
|
217 |
'CN-College-Listen-MCQ': 'cn_college_listen_mcq_test',
|
218 |
'DREAM-TTS-MCQ' : 'dream_tts_mcq_test',
|
219 |
'SLUE-P2-SQA5' : 'slue_p2_sqa5_test',
|
220 |
'Public-SG-Speech-QA' : 'public_sg_speech_qa_test',
|
221 |
'Spoken-SQuAD' : 'spoken_squad_test',
|
222 |
+
'MMAU-mini' : 'mmau_mini',
|
223 |
+
|
224 |
+
'MNSC-PART3-SQA' : 'imda_part3_30s_sqa_human_test',
|
225 |
+
'MNSC-PART4-SQA' : 'imda_part4_30s_sqa_human_test',
|
226 |
+
'MNSC-PART5-SQA' : 'imda_part5_30s_sqa_human_test',
|
227 |
+
'MNSC-PART6-SQA' : 'imda_part6_30s_sqa_human_test',
|
228 |
+
|
229 |
+
'MNSC-PART3-SDS' : 'imda_part3_30s_ds_human_test',
|
230 |
+
'MNSC-PART4-SDS' : 'imda_part4_30s_ds_human_test',
|
231 |
+
'MNSC-PART5-SDS' : 'imda_part5_30s_ds_human_test',
|
232 |
+
'MNSC-PART6-SDS' : 'imda_part6_30s_ds_human_test',
|
233 |
+
|
234 |
'OpenHermes-Audio' : 'openhermes_audio_test',
|
235 |
'ALPACA-Audio' : 'alpaca_audio_test',
|
236 |
+
|
237 |
'WavCaps' : 'wavcaps_test',
|
238 |
'AudioCaps' : 'audiocaps_test',
|
239 |
+
|
240 |
'Clotho-AQA' : 'clotho_aqa_test',
|
241 |
'WavCaps-QA' : 'wavcaps_qa_test',
|
242 |
'AudioCaps-QA' : 'audiocaps_qa_test',
|
243 |
+
|
244 |
+
'IEMOCAP-Emotion' : 'iemocap_emotion_test',
|
245 |
+
'MELD-Sentiment' : 'meld_sentiment_test',
|
246 |
+
'MELD-Emotion' : 'meld_emotion_test',
|
247 |
+
|
248 |
'VoxCeleb-Accent' : 'voxceleb_accent_test',
|
249 |
'MNSC-AR-Sentence' : 'imda_ar_sentence',
|
250 |
'MNSC-AR-Dialogue' : 'imda_ar_dialogue',
|
251 |
+
|
252 |
'VoxCeleb-Gender' : 'voxceleb_gender_test',
|
253 |
'IEMOCAP-Gender' : 'iemocap_gender_test',
|
254 |
+
|
|
|
|
|
255 |
'MuChoMusic' : 'muchomusic_test',
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
256 |
|
257 |
+
'YouTube SQA: English with Singapore Content': 'ytb_sqa_batch1',
|
258 |
+
'YouTube SDS: English with Singapore Content': 'ytb_sds_batch1',
|
259 |
+
'YouTube PQA: English with Singapore Content': 'ytb_pqa_batch1',
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
260 |
|
261 |
'YouTube SQA: Malay': 'ytb_sqa_batch3_malay',
|
262 |
'YouTube SQA: Chinese': 'ytb_sqa_batch3_chinese',
|
|
|
265 |
'YouTube SDS: Malay': 'ytb_sds_batch3_malay',
|
266 |
'YouTube SDS: Chinese': 'ytb_sds_batch3_chinese',
|
267 |
'YouTube SDS: Tamil': 'ytb_sds_batch3_tamil',
|
268 |
+
}
|
|
|
|
|
269 |
|
|
|
|
|
|
|
270 |
|
271 |
+
displayname2datasetname = {}
|
272 |
+
displayname2datasetname.update(wer_displayname2datasetname)
|
273 |
+
displayname2datasetname.update(non_wer_displayname2datasetname)
|
274 |
+
|
275 |
|
276 |
datasetname2diaplayname = {datasetname: displayname for displayname, datasetname in displayname2datasetname.items()}
|
277 |
|
|
|
339 |
'Parliament-Short': 'Under Development',
|
340 |
'UKUS-News-Short' : 'Under Development',
|
341 |
'Mediacorp-Short' : 'Under Development',
|
342 |
+
|
343 |
+
'CommonVoice-ZH' : 'Under Development',
|
344 |
+
'CommonVoice-17-Indonesian' : 'Under Development',
|
345 |
+
'CommonVoice-17-Tamil' : 'Under Development',
|
346 |
+
'CommonVoice-17-Thai' : 'Under Development',
|
347 |
+
'CommonVoice-17-Vietnamese' : 'Under Development',
|
348 |
+
'GigaSpeech-2-Indonesain' : 'Under Development',
|
349 |
+
'GigaSpeech-2-Thai' : 'Under Development',
|
350 |
+
'GigaSpeech-2-Vietnamese' : 'Under Development',
|
351 |
+
'Fleurs-Tamil' : 'Under Development',
|
352 |
+
'Lotus-Thai' : 'Under Development',
|
353 |
+
'MMAU-mini' : 'Under Development',
|
354 |
+
|
355 |
|
356 |
'YouTube ASR: English Singapore Content' : 'YouTube Evaluation Dataset for ASR Task: <br> This dataset contains English and Singlish audio clips, featuring Singapore-related content. <br> It includes approximately 2.5 hours of audio, with individual clips ranging from 2 seconds to 30 seconds in length.',
|
357 |
|
app/draw_diagram.py
CHANGED
@@ -7,6 +7,7 @@ from app.content import *
|
|
7 |
|
8 |
import pandas as pd
|
9 |
|
|
|
10 |
from model_information import get_dataframe
|
11 |
info_df = get_dataframe()
|
12 |
|
@@ -81,38 +82,7 @@ def draw(folder_name, category_name, displayname, metrics, cus_sort=True):
|
|
81 |
|
82 |
return df_style
|
83 |
|
84 |
-
if cur_dataset_name in
|
85 |
-
'LibriSpeech-Clean',
|
86 |
-
'LibriSpeech-Other',
|
87 |
-
'CommonVoice-15-EN',
|
88 |
-
'Peoples-Speech',
|
89 |
-
'GigaSpeech-1',
|
90 |
-
'Earnings-21',
|
91 |
-
'Earnings-22',
|
92 |
-
'TED-LIUM-3',
|
93 |
-
'TED-LIUM-3-LongForm',
|
94 |
-
'AISHELL-ASR-ZH',
|
95 |
-
'MNSC-PART1-ASR',
|
96 |
-
'MNSC-PART2-ASR',
|
97 |
-
'MNSC-PART3-ASR',
|
98 |
-
'MNSC-PART4-ASR',
|
99 |
-
'MNSC-PART5-ASR',
|
100 |
-
'MNSC-PART6-ASR',
|
101 |
-
'CNA',
|
102 |
-
'IDPC',
|
103 |
-
'Parliament',
|
104 |
-
'UKUS-News',
|
105 |
-
'Mediacorp',
|
106 |
-
'IDPC-Short',
|
107 |
-
'Parliament-Short',
|
108 |
-
'UKUS-News-Short',
|
109 |
-
'Mediacorp-Short',
|
110 |
-
'YTB-ASR-Batch1',
|
111 |
-
'YTB-ASR-Batch2',
|
112 |
-
'SEAME-Dev-Man',
|
113 |
-
'SEAME-Dev-Sge',
|
114 |
-
]:
|
115 |
-
|
116 |
chart_data_table = chart_data_table.sort_values(
|
117 |
by=chart_data_table.columns[1],
|
118 |
ascending=True
|
|
|
7 |
|
8 |
import pandas as pd
|
9 |
|
10 |
+
from app.content import wer_displayname2datasetname
|
11 |
from model_information import get_dataframe
|
12 |
info_df = get_dataframe()
|
13 |
|
|
|
82 |
|
83 |
return df_style
|
84 |
|
85 |
+
if cur_dataset_name in wer_displayname2datasetname:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
86 |
chart_data_table = chart_data_table.sort_values(
|
87 |
by=chart_data_table.columns[1],
|
88 |
ascending=True
|
app/pages.py
CHANGED
@@ -120,28 +120,12 @@ def dashboard():
|
|
120 |
""")
|
121 |
|
122 |
|
123 |
-
|
124 |
-
|
125 |
-
|
126 |
-
|
127 |
-
|
128 |
def asr_english():
|
129 |
st.title("Task: Automatic Speech Recognition - English")
|
130 |
|
131 |
sum = ['Overall']
|
132 |
-
|
133 |
-
|
134 |
-
'LibriSpeech-Other',
|
135 |
-
'CommonVoice-15-EN',
|
136 |
-
'Peoples-Speech',
|
137 |
-
'GigaSpeech-1',
|
138 |
-
'Earnings-21',
|
139 |
-
'Earnings-22',
|
140 |
-
'TED-LIUM-3',
|
141 |
-
'TED-LIUM-3-LongForm',
|
142 |
-
]
|
143 |
-
|
144 |
-
filters_levelone = sum + dataset_lists
|
145 |
|
146 |
left, center, _, middle, right = st.columns([0.4, 0.2, 0.2, 0.2 ,0.2])
|
147 |
|
@@ -156,23 +140,12 @@ def asr_english():
|
|
156 |
draw('su', 'asr_english', filter_1, 'wer', cus_sort=True)
|
157 |
|
158 |
|
159 |
-
|
160 |
-
|
161 |
-
|
162 |
def asr_singlish():
|
163 |
st.title("Task: Automatic Speech Recognition - Singlish")
|
164 |
|
165 |
sum = ['Overall']
|
166 |
-
dataset_lists = [
|
167 |
-
'MNSC-PART1-ASR',
|
168 |
-
'MNSC-PART2-ASR',
|
169 |
-
'MNSC-PART3-ASR',
|
170 |
-
'MNSC-PART4-ASR',
|
171 |
-
'MNSC-PART5-ASR',
|
172 |
-
'MNSC-PART6-ASR',
|
173 |
-
]
|
174 |
|
175 |
-
filters_levelone = sum +
|
176 |
|
177 |
left, center, _, middle, right = st.columns([0.4, 0.2, 0.2, 0.2 ,0.2])
|
178 |
|
@@ -187,17 +160,12 @@ def asr_singlish():
|
|
187 |
draw('su', 'asr_singlish', filter_1, 'wer')
|
188 |
|
189 |
|
190 |
-
|
191 |
-
|
192 |
def asr_mandarin():
|
193 |
st.title("Task: Automatic Speech Recognition - Mandarin")
|
194 |
|
195 |
sum = ['Overall']
|
196 |
-
dataset_lists = [
|
197 |
-
'AISHELL-ASR-ZH',
|
198 |
-
]
|
199 |
|
200 |
-
filters_levelone = sum +
|
201 |
|
202 |
left, center, _, middle, right = st.columns([0.4, 0.2, 0.2, 0.2 ,0.2])
|
203 |
|
@@ -211,22 +179,53 @@ def asr_mandarin():
|
|
211 |
dataset_contents(dataset_diaplay_information[filter_1], metrics_info['wer'])
|
212 |
draw('su', 'asr_mandarin', filter_1, 'wer')
|
213 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
214 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
215 |
|
216 |
|
217 |
def speech_translation():
|
218 |
st.title("Task: Speech Translation")
|
219 |
|
220 |
sum = ['Overall']
|
221 |
-
dataset_lists = [
|
222 |
-
'CoVoST2-EN-ID',
|
223 |
-
'CoVoST2-EN-ZH',
|
224 |
-
'CoVoST2-EN-TA',
|
225 |
-
'CoVoST2-ID-EN',
|
226 |
-
'CoVoST2-ZH-EN',
|
227 |
-
'CoVoST2-TA-EN']
|
228 |
|
229 |
-
filters_levelone = sum +
|
230 |
|
231 |
left, center, _, middle, right = st.columns([0.4, 0.2, 0.2, 0.2 ,0.2])
|
232 |
|
@@ -241,22 +240,12 @@ def speech_translation():
|
|
241 |
draw('su', 'ST', filter_1, 'bleu')
|
242 |
|
243 |
|
244 |
-
|
245 |
-
|
246 |
def speech_question_answering_english():
|
247 |
st.title("Task: Spoken Question Answering - English")
|
248 |
|
249 |
sum = ['Overall']
|
250 |
|
251 |
-
|
252 |
-
'CN-College-Listen-MCQ',
|
253 |
-
'DREAM-TTS-MCQ',
|
254 |
-
'SLUE-P2-SQA5',
|
255 |
-
'Public-SG-Speech-QA',
|
256 |
-
'Spoken-SQuAD',
|
257 |
-
]
|
258 |
-
|
259 |
-
filters_levelone = sum + dataset_lists
|
260 |
|
261 |
left, center, _, middle, right = st.columns([0.4, 0.2, 0.2, 0.2 ,0.2])
|
262 |
|
@@ -276,22 +265,12 @@ def speech_question_answering_english():
|
|
276 |
draw('su', 'sqa_english', filter_1, 'llama3_70b_judge')
|
277 |
|
278 |
|
279 |
-
|
280 |
-
|
281 |
def speech_question_answering_singlish():
|
282 |
st.title("Task: Spoken Question Answering - Singlish")
|
283 |
|
284 |
sum = ['Overall']
|
285 |
|
286 |
-
|
287 |
-
'MNSC-PART3-SQA',
|
288 |
-
'MNSC-PART4-SQA',
|
289 |
-
'MNSC-PART5-SQA',
|
290 |
-
'MNSC-PART6-SQA',
|
291 |
-
]
|
292 |
-
|
293 |
-
|
294 |
-
filters_levelone = sum + dataset_lists
|
295 |
|
296 |
left, center, _, middle, right = st.columns([0.4, 0.2, 0.2, 0.2 ,0.2])
|
297 |
|
@@ -312,15 +291,7 @@ def spoken_dialogue_summarization_singlish():
|
|
312 |
|
313 |
sum = ['Overall']
|
314 |
|
315 |
-
|
316 |
-
'MNSC-PART3-SDS',
|
317 |
-
'MNSC-PART4-SDS',
|
318 |
-
'MNSC-PART5-SDS',
|
319 |
-
'MNSC-PART6-SDS',
|
320 |
-
]
|
321 |
-
|
322 |
-
|
323 |
-
filters_levelone = sum + dataset_lists
|
324 |
|
325 |
left, center, _, middle, right = st.columns([0.4, 0.2, 0.2, 0.2 ,0.2])
|
326 |
|
@@ -336,18 +307,12 @@ def spoken_dialogue_summarization_singlish():
|
|
336 |
draw('su', 'sds_singlish', filter_1, 'llama3_70b_judge')
|
337 |
|
338 |
|
339 |
-
|
340 |
-
|
341 |
def speech_instruction():
|
342 |
st.title("Task: Speech Instruction")
|
343 |
|
344 |
sum = ['Overall']
|
345 |
-
|
346 |
-
dataset_lists = ['OpenHermes-Audio',
|
347 |
-
'ALPACA-Audio',
|
348 |
-
]
|
349 |
|
350 |
-
filters_levelone = sum +
|
351 |
|
352 |
left, center, _, middle, right = st.columns([0.4, 0.2, 0.2, 0.2 ,0.2])
|
353 |
|
@@ -362,14 +327,11 @@ def speech_instruction():
|
|
362 |
draw('su', 'speech_instruction', filter_1, 'llama3_70b_judge')
|
363 |
|
364 |
|
365 |
-
|
366 |
-
|
367 |
def audio_captioning():
|
368 |
st.title("Task: Audio Captioning")
|
369 |
|
370 |
-
filters_levelone =
|
371 |
-
|
372 |
-
]
|
373 |
filters_leveltwo = ['Llama3-70b-judge', 'Meteor']
|
374 |
|
375 |
left, center, _, middle, right = st.columns([0.4, 0.2, 0.2, 0.2 ,0.2])
|
@@ -384,18 +346,12 @@ def audio_captioning():
|
|
384 |
draw('asu', 'audio_captioning', filter_1, metric.lower().replace('-', '_'))
|
385 |
|
386 |
|
387 |
-
|
388 |
-
|
389 |
def audio_scene_question_answering():
|
390 |
st.title("Task: Audio Scene Question Answering")
|
391 |
|
392 |
sum = ['Overall']
|
393 |
-
|
394 |
-
dataset_lists = ['Clotho-AQA',
|
395 |
-
'WavCaps-QA',
|
396 |
-
'AudioCaps-QA']
|
397 |
|
398 |
-
filters_levelone = sum +
|
399 |
|
400 |
left, center, _, middle, right = st.columns([0.4, 0.2, 0.2, 0.2 ,0.2])
|
401 |
|
@@ -410,20 +366,12 @@ def audio_scene_question_answering():
|
|
410 |
draw('asu', 'audio_scene_question_answering', filter_1, 'llama3_70b_judge')
|
411 |
|
412 |
|
413 |
-
|
414 |
-
|
415 |
def emotion_recognition():
|
416 |
st.title("Task: Emotion Recognition")
|
417 |
|
418 |
sum = ['Overall']
|
419 |
|
420 |
-
|
421 |
-
'IEMOCAP-Emotion',
|
422 |
-
'MELD-Sentiment',
|
423 |
-
'MELD-Emotion',
|
424 |
-
]
|
425 |
-
|
426 |
-
filters_levelone = sum + dataset_lists
|
427 |
|
428 |
left, center, _, middle, right = st.columns([0.4, 0.2, 0.2, 0.2 ,0.2])
|
429 |
|
@@ -438,20 +386,12 @@ def emotion_recognition():
|
|
438 |
draw('vu', 'emotion_recognition', filter_1, 'llama3_70b_judge')
|
439 |
|
440 |
|
441 |
-
|
442 |
-
|
443 |
def accent_recognition():
|
444 |
st.title("Task: Accent Recognition")
|
445 |
|
446 |
sum = ['Overall']
|
447 |
-
dataset_lists = [
|
448 |
-
'VoxCeleb-Accent',
|
449 |
-
'MNSC-AR-Sentence',
|
450 |
-
'MNSC-AR-Dialogue',
|
451 |
-
]
|
452 |
-
|
453 |
|
454 |
-
filters_levelone = sum +
|
455 |
|
456 |
left, center, _, middle, right = st.columns([0.4, 0.2, 0.2, 0.2 ,0.2])
|
457 |
|
@@ -467,19 +407,12 @@ def accent_recognition():
|
|
467 |
draw('vu', 'accent_recognition', filter_1, 'llama3_70b_judge')
|
468 |
|
469 |
|
470 |
-
|
471 |
-
|
472 |
def gender_recognition():
|
473 |
st.title("Task: Gender Recognition")
|
474 |
|
475 |
sum = ['Overall']
|
476 |
|
477 |
-
|
478 |
-
'VoxCeleb-Gender',
|
479 |
-
'IEMOCAP-Gender'
|
480 |
-
]
|
481 |
-
|
482 |
-
filters_levelone = sum + dataset_lists
|
483 |
|
484 |
left, center, _, middle, right = st.columns([0.4, 0.2, 0.2, 0.2 ,0.2])
|
485 |
|
@@ -494,17 +427,12 @@ def gender_recognition():
|
|
494 |
draw('vu', 'gender_recognition', filter_1, 'llama3_70b_judge')
|
495 |
|
496 |
|
497 |
-
|
498 |
-
|
499 |
def music_understanding():
|
500 |
st.title("Task: Music Understanding - MCQ Questions")
|
501 |
|
502 |
sum = ['Overall']
|
503 |
|
504 |
-
|
505 |
-
]
|
506 |
-
|
507 |
-
filters_levelone = sum + dataset_lists
|
508 |
|
509 |
left, center, _, middle, right = st.columns([0.4, 0.2, 0.2, 0.2 ,0.2])
|
510 |
|
@@ -519,43 +447,10 @@ def music_understanding():
|
|
519 |
draw('vu', 'music_understanding', filter_1, 'llama3_70b_judge')
|
520 |
|
521 |
|
522 |
-
|
523 |
-
|
524 |
-
|
525 |
def under_development():
|
526 |
st.title("Task: Under Development")
|
527 |
-
|
528 |
|
529 |
-
|
530 |
-
'YouTube ASR: English Singapore Content',
|
531 |
-
'YouTube ASR: English with Strong Emotion',
|
532 |
-
'YouTube ASR: Malay with English Prompt',
|
533 |
-
'YouTube ASR: Malay with Malay Prompt',
|
534 |
-
'YouTube ASR: Chinese with English Prompt',
|
535 |
-
'YouTube ASR: Chinese with Chinese Prompt',
|
536 |
-
|
537 |
-
'YouTube SQA: English with Singapore Content',
|
538 |
-
'YouTube SDS: English with Singapore Content',
|
539 |
-
'YouTube PQA: English with Singapore Content',
|
540 |
-
|
541 |
-
'CNA',
|
542 |
-
'IDPC',
|
543 |
-
'Parliament',
|
544 |
-
'UKUS-News',
|
545 |
-
'Mediacorp',
|
546 |
-
'IDPC-Short',
|
547 |
-
'Parliament-Short',
|
548 |
-
'UKUS-News-Short',
|
549 |
-
'Mediacorp-Short',
|
550 |
-
|
551 |
-
'SEAME-Dev-Mandarin',
|
552 |
-
'SEAME-Dev-Singlish',
|
553 |
-
|
554 |
-
|
555 |
-
|
556 |
-
]
|
557 |
-
|
558 |
-
filters_levelone = dataset_lists
|
559 |
|
560 |
left, center, _, middle, right = st.columns([0.4, 0.2, 0.2, 0.2 ,0.2])
|
561 |
|
@@ -592,39 +487,8 @@ def under_development():
|
|
592 |
st.markdown('To be implemented')
|
593 |
# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
|
594 |
|
595 |
-
if filter_1 in
|
596 |
-
'CNA',
|
597 |
-
'IDPC',
|
598 |
-
'Parliament',
|
599 |
-
'UKUS-News',
|
600 |
-
'Mediacorp',
|
601 |
-
'IDPC-Short',
|
602 |
-
'Parliament-Short',
|
603 |
-
'UKUS-News-Short',
|
604 |
-
'Mediacorp-Short',
|
605 |
-
|
606 |
-
'YouTube ASR: English Singapore Content',
|
607 |
-
'YouTube ASR: English with Strong Emotion',
|
608 |
-
'YouTube ASR: Malay with English Prompt',
|
609 |
-
'YouTube ASR: Malay with Malay Prompt',
|
610 |
-
|
611 |
-
'YouTube ASR: Chinese with English Prompt',
|
612 |
-
'YouTube ASR: Chinese with Chinese Prompt',
|
613 |
-
|
614 |
-
'SEAME-Dev-Mandarin',
|
615 |
-
'SEAME-Dev-Singlish',
|
616 |
-
]:
|
617 |
-
|
618 |
draw('vu', 'under_development_wer', filter_1, 'wer')
|
619 |
|
620 |
-
elif filter_1 in
|
621 |
-
'YouTube SQA: English with Singapore Content',
|
622 |
-
'YouTube SDS: English with Singapore Content',
|
623 |
-
'YouTube PQA: English with Singapore Content',
|
624 |
-
]:
|
625 |
draw('vu', 'under_development_llama3_70b_judge', filter_1, 'llama3_70b_judge')
|
626 |
-
|
627 |
-
|
628 |
-
|
629 |
-
|
630 |
-
|
|
|
120 |
""")
|
121 |
|
122 |
|
|
|
|
|
|
|
|
|
|
|
123 |
def asr_english():
|
124 |
st.title("Task: Automatic Speech Recognition - English")
|
125 |
|
126 |
sum = ['Overall']
|
127 |
+
|
128 |
+
filters_levelone = sum + asr_english_datasets
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
129 |
|
130 |
left, center, _, middle, right = st.columns([0.4, 0.2, 0.2, 0.2 ,0.2])
|
131 |
|
|
|
140 |
draw('su', 'asr_english', filter_1, 'wer', cus_sort=True)
|
141 |
|
142 |
|
|
|
|
|
|
|
143 |
def asr_singlish():
|
144 |
st.title("Task: Automatic Speech Recognition - Singlish")
|
145 |
|
146 |
sum = ['Overall']
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
147 |
|
148 |
+
filters_levelone = sum + asr_singlish_datasets
|
149 |
|
150 |
left, center, _, middle, right = st.columns([0.4, 0.2, 0.2, 0.2 ,0.2])
|
151 |
|
|
|
160 |
draw('su', 'asr_singlish', filter_1, 'wer')
|
161 |
|
162 |
|
|
|
|
|
163 |
def asr_mandarin():
|
164 |
st.title("Task: Automatic Speech Recognition - Mandarin")
|
165 |
|
166 |
sum = ['Overall']
|
|
|
|
|
|
|
167 |
|
168 |
+
filters_levelone = sum + asr_mandarin_datasets
|
169 |
|
170 |
left, center, _, middle, right = st.columns([0.4, 0.2, 0.2, 0.2 ,0.2])
|
171 |
|
|
|
179 |
dataset_contents(dataset_diaplay_information[filter_1], metrics_info['wer'])
|
180 |
draw('su', 'asr_mandarin', filter_1, 'wer')
|
181 |
|
182 |
+
|
183 |
+
def asr_sea():
|
184 |
+
st.title("Task: Automatic Speech Recognition - SEA Languages")
|
185 |
+
|
186 |
+
sum = ['Overall']
|
187 |
+
|
188 |
+
filters_levelone = sum + asr_sea_datasets
|
189 |
+
|
190 |
+
left, center, _, middle, right = st.columns([0.4, 0.2, 0.2, 0.2 ,0.2])
|
191 |
+
|
192 |
+
with left:
|
193 |
+
filter_1 = st.selectbox('Dataset', filters_levelone)
|
194 |
+
|
195 |
+
if filter_1:
|
196 |
+
if filter_1 in sum:
|
197 |
+
sum_table_mulit_metrix('asr_sea', ['wer'])
|
198 |
+
else:
|
199 |
+
dataset_contents(dataset_diaplay_information[filter_1], metrics_info['wer'])
|
200 |
+
draw('su', 'asr_sea', filter_1, 'wer')
|
201 |
+
|
202 |
+
|
203 |
+
def asr_private():
|
204 |
+
st.title("Task: Automatic Speech Recognition - Private Datasets")
|
205 |
+
|
206 |
+
sum = ['Overall']
|
207 |
+
|
208 |
+
filters_levelone = sum + asr_private_datasets
|
209 |
|
210 |
+
left, center, _, middle, right = st.columns([0.4, 0.2, 0.2, 0.2 ,0.2])
|
211 |
+
|
212 |
+
with left:
|
213 |
+
filter_1 = st.selectbox('Dataset', filters_levelone)
|
214 |
+
|
215 |
+
if filter_1:
|
216 |
+
if filter_1 in sum:
|
217 |
+
sum_table_mulit_metrix('asr_private', ['wer'])
|
218 |
+
else:
|
219 |
+
dataset_contents(dataset_diaplay_information[filter_1], metrics_info['wer'])
|
220 |
+
draw('su', 'asr_private', filter_1, 'wer')
|
221 |
|
222 |
|
223 |
def speech_translation():
|
224 |
st.title("Task: Speech Translation")
|
225 |
|
226 |
sum = ['Overall']
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
227 |
|
228 |
+
filters_levelone = sum + speech_translation_datasets
|
229 |
|
230 |
left, center, _, middle, right = st.columns([0.4, 0.2, 0.2, 0.2 ,0.2])
|
231 |
|
|
|
240 |
draw('su', 'ST', filter_1, 'bleu')
|
241 |
|
242 |
|
|
|
|
|
243 |
def speech_question_answering_english():
|
244 |
st.title("Task: Spoken Question Answering - English")
|
245 |
|
246 |
sum = ['Overall']
|
247 |
|
248 |
+
filters_levelone = sum + speech_qa_english_datasets
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
249 |
|
250 |
left, center, _, middle, right = st.columns([0.4, 0.2, 0.2, 0.2 ,0.2])
|
251 |
|
|
|
265 |
draw('su', 'sqa_english', filter_1, 'llama3_70b_judge')
|
266 |
|
267 |
|
|
|
|
|
268 |
def speech_question_answering_singlish():
|
269 |
st.title("Task: Spoken Question Answering - Singlish")
|
270 |
|
271 |
sum = ['Overall']
|
272 |
|
273 |
+
filters_levelone = sum + speech_qa_singlish_datasets
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
274 |
|
275 |
left, center, _, middle, right = st.columns([0.4, 0.2, 0.2, 0.2 ,0.2])
|
276 |
|
|
|
291 |
|
292 |
sum = ['Overall']
|
293 |
|
294 |
+
filters_levelone = sum + sds_datasets
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
295 |
|
296 |
left, center, _, middle, right = st.columns([0.4, 0.2, 0.2, 0.2 ,0.2])
|
297 |
|
|
|
307 |
draw('su', 'sds_singlish', filter_1, 'llama3_70b_judge')
|
308 |
|
309 |
|
|
|
|
|
310 |
def speech_instruction():
|
311 |
st.title("Task: Speech Instruction")
|
312 |
|
313 |
sum = ['Overall']
|
|
|
|
|
|
|
|
|
314 |
|
315 |
+
filters_levelone = sum + si_datasets
|
316 |
|
317 |
left, center, _, middle, right = st.columns([0.4, 0.2, 0.2, 0.2 ,0.2])
|
318 |
|
|
|
327 |
draw('su', 'speech_instruction', filter_1, 'llama3_70b_judge')
|
328 |
|
329 |
|
|
|
|
|
330 |
def audio_captioning():
|
331 |
st.title("Task: Audio Captioning")
|
332 |
|
333 |
+
filters_levelone = ac_datasets
|
334 |
+
|
|
|
335 |
filters_leveltwo = ['Llama3-70b-judge', 'Meteor']
|
336 |
|
337 |
left, center, _, middle, right = st.columns([0.4, 0.2, 0.2, 0.2 ,0.2])
|
|
|
346 |
draw('asu', 'audio_captioning', filter_1, metric.lower().replace('-', '_'))
|
347 |
|
348 |
|
|
|
|
|
349 |
def audio_scene_question_answering():
|
350 |
st.title("Task: Audio Scene Question Answering")
|
351 |
|
352 |
sum = ['Overall']
|
|
|
|
|
|
|
|
|
353 |
|
354 |
+
filters_levelone = sum + asqa_datasets
|
355 |
|
356 |
left, center, _, middle, right = st.columns([0.4, 0.2, 0.2, 0.2 ,0.2])
|
357 |
|
|
|
366 |
draw('asu', 'audio_scene_question_answering', filter_1, 'llama3_70b_judge')
|
367 |
|
368 |
|
|
|
|
|
369 |
def emotion_recognition():
|
370 |
st.title("Task: Emotion Recognition")
|
371 |
|
372 |
sum = ['Overall']
|
373 |
|
374 |
+
filters_levelone = sum + er_datasets
|
|
|
|
|
|
|
|
|
|
|
|
|
375 |
|
376 |
left, center, _, middle, right = st.columns([0.4, 0.2, 0.2, 0.2 ,0.2])
|
377 |
|
|
|
386 |
draw('vu', 'emotion_recognition', filter_1, 'llama3_70b_judge')
|
387 |
|
388 |
|
|
|
|
|
389 |
def accent_recognition():
|
390 |
st.title("Task: Accent Recognition")
|
391 |
|
392 |
sum = ['Overall']
|
|
|
|
|
|
|
|
|
|
|
|
|
393 |
|
394 |
+
filters_levelone = sum + ar_datasets
|
395 |
|
396 |
left, center, _, middle, right = st.columns([0.4, 0.2, 0.2, 0.2 ,0.2])
|
397 |
|
|
|
407 |
draw('vu', 'accent_recognition', filter_1, 'llama3_70b_judge')
|
408 |
|
409 |
|
|
|
|
|
410 |
def gender_recognition():
|
411 |
st.title("Task: Gender Recognition")
|
412 |
|
413 |
sum = ['Overall']
|
414 |
|
415 |
+
filters_levelone = sum + gr_datasets
|
|
|
|
|
|
|
|
|
|
|
416 |
|
417 |
left, center, _, middle, right = st.columns([0.4, 0.2, 0.2, 0.2 ,0.2])
|
418 |
|
|
|
427 |
draw('vu', 'gender_recognition', filter_1, 'llama3_70b_judge')
|
428 |
|
429 |
|
|
|
|
|
430 |
def music_understanding():
|
431 |
st.title("Task: Music Understanding - MCQ Questions")
|
432 |
|
433 |
sum = ['Overall']
|
434 |
|
435 |
+
filters_levelone = sum + music_datasets
|
|
|
|
|
|
|
436 |
|
437 |
left, center, _, middle, right = st.columns([0.4, 0.2, 0.2, 0.2 ,0.2])
|
438 |
|
|
|
447 |
draw('vu', 'music_understanding', filter_1, 'llama3_70b_judge')
|
448 |
|
449 |
|
|
|
|
|
|
|
450 |
def under_development():
|
451 |
st.title("Task: Under Development")
|
|
|
452 |
|
453 |
+
filters_levelone = non_wer_development_datasets + wer_development_datasets
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
454 |
|
455 |
left, center, _, middle, right = st.columns([0.4, 0.2, 0.2, 0.2 ,0.2])
|
456 |
|
|
|
487 |
st.markdown('To be implemented')
|
488 |
# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
|
489 |
|
490 |
+
if filter_1 in wer_development_datasets:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
491 |
draw('vu', 'under_development_wer', filter_1, 'wer')
|
492 |
|
493 |
+
elif filter_1 in non_wer_development_datasets:
|
|
|
|
|
|
|
|
|
494 |
draw('vu', 'under_development_llama3_70b_judge', filter_1, 'llama3_70b_judge')
|
|
|
|
|
|
|
|
|
|
app/summarization.py
CHANGED
@@ -27,7 +27,6 @@ def sum_table_mulit_metrix(task_name, metrics_lists: List[str]):
|
|
27 |
chart_data = one_chart_data
|
28 |
else:
|
29 |
chart_data = pd.merge(chart_data, one_chart_data, on='Model', how='outer')
|
30 |
-
|
31 |
|
32 |
selected_columns = [i for i in chart_data.columns if i != 'Model']
|
33 |
chart_data['Average'] = chart_data[selected_columns].mean(axis=1)
|
@@ -79,7 +78,8 @@ def sum_table_mulit_metrix(task_name, metrics_lists: List[str]):
|
|
79 |
|
80 |
|
81 |
# Format numeric columns to 2 decimal places
|
82 |
-
|
|
|
83 |
|
84 |
if metrics in ['wer']:
|
85 |
ascend = True
|
|
|
27 |
chart_data = one_chart_data
|
28 |
else:
|
29 |
chart_data = pd.merge(chart_data, one_chart_data, on='Model', how='outer')
|
|
|
30 |
|
31 |
selected_columns = [i for i in chart_data.columns if i != 'Model']
|
32 |
chart_data['Average'] = chart_data[selected_columns].mean(axis=1)
|
|
|
78 |
|
79 |
|
80 |
# Format numeric columns to 2 decimal places
|
81 |
+
target_column = chart_data_table.columns[1]
|
82 |
+
chart_data_table.loc[:, target_column] = chart_data_table[target_column].apply(lambda x: round(float(x), 3) if isinstance(float(x), (int, float)) else float(x))
|
83 |
|
84 |
if metrics in ['wer']:
|
85 |
ascend = True
|
model_information.py
CHANGED
@@ -36,6 +36,30 @@ data['Original Name'].append('MERaLiON-AudioLLM-Whisper-SEA-LION')
|
|
36 |
data['Proper Display Name'].append('Fusion: MERaLiON-AudioLLM-Whisper-SEA-LION')
|
37 |
data['Link'].append('https://huggingface.co/MERaLiON/MERaLiON-AudioLLM-Whisper-SEA-LION')
|
38 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
39 |
data['Original Name'].append('cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct')
|
40 |
data['Proper Display Name'].append('Cascade: Whisper-Large-v2 / SEA-LIONv3')
|
41 |
data['Link'].append('https://github.com/aisingapore/sealion')
|
@@ -44,7 +68,6 @@ data['Original Name'].append('whisper_large_v3')
|
|
44 |
data['Proper Display Name'].append('Whisper-large-v3')
|
45 |
data['Link'].append('https://huggingface.co/openai/whisper-large-v3')
|
46 |
|
47 |
-
|
48 |
data['Original Name'].append('gemini-1.5-flash')
|
49 |
data['Proper Display Name'].append('Gemini-1.5-Flash')
|
50 |
data['Link'].append('https://ai.google.dev/gemini-api/docs/models/gemini')
|
|
|
36 |
data['Proper Display Name'].append('Fusion: MERaLiON-AudioLLM-Whisper-SEA-LION')
|
37 |
data['Link'].append('https://huggingface.co/MERaLiON/MERaLiON-AudioLLM-Whisper-SEA-LION')
|
38 |
|
39 |
+
data['Original Name'].append('MERaLiON-AudioLLM-v2-2b')
|
40 |
+
data['Proper Display Name'].append('Fusion: MERaLiON-2-3B')
|
41 |
+
data['Link'].append('https://huggingface.co/MERaLiON/MERaLiON-AudioLLM-Whisper-SEA-LION')
|
42 |
+
|
43 |
+
data['Original Name'].append('MERaLiON-AudioLLM-v2-9b')
|
44 |
+
data['Proper Display Name'].append('Fusion: MERaLiON-2-10B')
|
45 |
+
data['Link'].append('https://huggingface.co/MERaLiON/MERaLiON-AudioLLM-Whisper-SEA-LION')
|
46 |
+
|
47 |
+
data['Original Name'].append('MERaLiON-AudioLLM-v2-9b-asr')
|
48 |
+
data['Proper Display Name'].append('Fusion: MERaLiON-2-10B-ASR')
|
49 |
+
data['Link'].append('https://huggingface.co/MERaLiON/MERaLiON-AudioLLM-Whisper-SEA-LION')
|
50 |
+
|
51 |
+
data['Original Name'].append('phi_4_multimodal_instruct')
|
52 |
+
data['Proper Display Name'].append('Fusion: Phi-4-multimodal-instruct')
|
53 |
+
data['Link'].append('https://huggingface.co/microsoft/Phi-4-multimodal-instruct')
|
54 |
+
|
55 |
+
data['Original Name'].append('Qwen2.5-Omni-3B')
|
56 |
+
data['Proper Display Name'].append('Fusion: Qwen2.5-Omni-3B')
|
57 |
+
data['Link'].append('https://huggingface.co/Qwen/Qwen2.5-Omni-3B')
|
58 |
+
|
59 |
+
data['Original Name'].append('Qwen2.5-Omni-7B')
|
60 |
+
data['Proper Display Name'].append('Fusion: Qwen2.5-Omni-7B')
|
61 |
+
data['Link'].append('https://huggingface.co/Qwen/Qwen2.5-Omni-7B')
|
62 |
+
|
63 |
data['Original Name'].append('cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct')
|
64 |
data['Proper Display Name'].append('Cascade: Whisper-Large-v2 / SEA-LIONv3')
|
65 |
data['Link'].append('https://github.com/aisingapore/sealion')
|
|
|
68 |
data['Proper Display Name'].append('Whisper-large-v3')
|
69 |
data['Link'].append('https://huggingface.co/openai/whisper-large-v3')
|
70 |
|
|
|
71 |
data['Original Name'].append('gemini-1.5-flash')
|
72 |
data['Proper Display Name'].append('Gemini-1.5-Flash')
|
73 |
data['Link'].append('https://ai.google.dev/gemini-api/docs/models/gemini')
|
results_organized/bleu/st.csv
CHANGED
@@ -1,12 +1,17 @@
|
|
1 |
Model,covost2_en_id_test,covost2_en_zh_test,covost2_en_ta_test,covost2_id_en_test,covost2_zh_en_test,covost2_ta_en_test
|
2 |
-
Qwen-Audio-Chat,4.102230932924371,15.330641138043728,0.
|
3 |
-
|
4 |
-
|
5 |
-
|
6 |
-
whisper_large_v3,1.600581653970121,0.16408986541757878,0.02107778621423822,46.01512198258627,14.673689493155793,2.451098639578599
|
7 |
old_models,,,,,,
|
8 |
-
cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct,27.620150160643625,35.274306071307024,8.433062902024755,46.80524126004861,15.209998552437538,2.8327095799289337
|
9 |
gemini-1.5-flash,,,,,,
|
10 |
-
WavLLM_fairseq,13.841886973016162,31.96381187282953,0.
|
11 |
-
|
12 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
Model,covost2_en_id_test,covost2_en_zh_test,covost2_en_ta_test,covost2_id_en_test,covost2_zh_en_test,covost2_ta_en_test
|
2 |
+
Qwen-Audio-Chat,4.102230932924371,15.330641138043728,0.0345148380723629,0.4564861971472884,9.898238298955656,0.0169914430109318
|
3 |
+
hy_whisper_local_cs,1.0869208512565696,0.1057326962921535,0.0089505165494316,22.267131378964944,7.31707791416422,2.8610263518826757
|
4 |
+
Qwen2-Audio-7B-Instruct,16.325186897428104,25.765420247070075,0.0324597207187291,6.326113431899141,16.466557744958333,0.0442583814605029
|
5 |
+
whisper_large_v3,1.600581653970121,0.1640898654175787,0.0210777862142382,46.01512198258627,14.673689493155791,2.451098639578599
|
|
|
6 |
old_models,,,,,,
|
|
|
7 |
gemini-1.5-flash,,,,,,
|
8 |
+
WavLLM_fairseq,13.841886973016162,31.96381187282953,0.0033159224040994,5.933522277713613,2.368659001743569,0.1695522548322915
|
9 |
+
MERaLiON-AudioLLM-Whisper-SEA-LION,37.058238343330466,43.96331874536172,13.808713343771569,43.37364836260576,19.55610418584389,4.758175879451736
|
10 |
+
MERaLiON-AudioLLM-v2-2b,30.658188021678257,40.02820084309168,5.601731502002274,37.77329494766737,16.777825775562142,1.9423083468131173
|
11 |
+
MERaLiON-AudioLLM-v2-9b,36.242124109428445,43.747307981166834,10.885517678613343,47.85937752036512,22.133726547487697,3.4786390367027833
|
12 |
+
Qwen2.5-Omni-3B,3.2577143149506815,10.28866767786604,0.020665917336912663,15.00712601210481,8.98152195711894,0.04161842995351044
|
13 |
+
Qwen2.5-Omni-7B,2.612412992528698,12.429229982446326,0.05482974047730791,12.471476026200369,9.974234734341179,0.02999794683579762
|
14 |
+
SALMONN_7B,14.193483776951359,33.255550227097565,0.0005121531999434492,27.88515689237341,5.175547389931541,0.40577007761551664
|
15 |
+
cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct,27.59161630015759,28.71368811388653,7.474730798912167,46.80524126004861,15.209998552437538,2.8327095799289337
|
16 |
+
cascade_whisper_large_v3_llama_3_8b_instruct,10.753313930099422,6.089840198985321,1.0029597453865848,46.79744652156276,14.156349261775734,2.4177196689141547
|
17 |
+
phi_4_multimodal_instruct,14.553644350540432,45.48015814069248,0.14817117451495013,0.37716244197757426,22.330318273444895,0.07320611681035753
|
results_organized/llama3_70b_judge/accent_recognition.csv
CHANGED
@@ -1,12 +1,17 @@
|
|
1 |
Model,voxceleb_accent_test,imda_ar_sentence,imda_ar_dialogue
|
2 |
Qwen-Audio-Chat,48.05088223225277,3.933333333333333,0.6666666666666667
|
3 |
-
MERaLiON-AudioLLM-Whisper-SEA-LION,47.01682396389003,7.816666666666666,77.83333333333333
|
4 |
hy_whisper_local_cs,,,
|
5 |
-
Qwen2-Audio-7B-Instruct,29.187525646286417,2.55,0.
|
6 |
whisper_large_v3,,,
|
7 |
old_models,,,
|
8 |
-
cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct,24.640951990151827,26.016666666666666,7.633333333333334
|
9 |
gemini-1.5-flash,,,
|
10 |
-
WavLLM_fairseq,39.96717275338531,2.6833333333333336,0.
|
11 |
-
|
12 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
Model,voxceleb_accent_test,imda_ar_sentence,imda_ar_dialogue
|
2 |
Qwen-Audio-Chat,48.05088223225277,3.933333333333333,0.6666666666666667
|
|
|
3 |
hy_whisper_local_cs,,,
|
4 |
+
Qwen2-Audio-7B-Instruct,29.187525646286417,2.55,0.9666666666666668
|
5 |
whisper_large_v3,,,
|
6 |
old_models,,,
|
|
|
7 |
gemini-1.5-flash,,,
|
8 |
+
WavLLM_fairseq,39.96717275338531,2.6833333333333336,0.2333333333333333
|
9 |
+
MERaLiON-AudioLLM-Whisper-SEA-LION,47.066064833812064,,
|
10 |
+
MERaLiON-AudioLLM-v2-2b,66.59827656955272,,
|
11 |
+
MERaLiON-AudioLLM-v2-9b,40.78785391875257,,
|
12 |
+
Qwen2.5-Omni-3B,0.9027492819039803,,
|
13 |
+
Qwen2.5-Omni-7B,1.661879359868691,,
|
14 |
+
SALMONN_7B,31.69881001231022,,
|
15 |
+
cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct,28.00574476815757,,
|
16 |
+
cascade_whisper_large_v3_llama_3_8b_instruct,40.29544521953221,,
|
17 |
+
phi_4_multimodal_instruct,2.6261797291752154,,
|
results_organized/llama3_70b_judge/audio_captioning.csv
CHANGED
@@ -1,12 +1,17 @@
|
|
1 |
Model,audiocaps_test,wavcaps_test
|
2 |
Qwen-Audio-Chat,47.04090909090909,32.9364161849711
|
3 |
-
MERaLiON-AudioLLM-Whisper-SEA-LION,38.00454545454545,33.97687861271676
|
4 |
hy_whisper_local_cs,,
|
5 |
Qwen2-Audio-7B-Instruct,40.77727272727273,33.78034682080925
|
6 |
whisper_large_v3,,
|
7 |
old_models,,
|
8 |
-
cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct,3.0954545454545457,6.3468208092485545
|
9 |
gemini-1.5-flash,,
|
10 |
WavLLM_fairseq,5.5,6.901734104046243
|
11 |
-
|
12 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
Model,audiocaps_test,wavcaps_test
|
2 |
Qwen-Audio-Chat,47.04090909090909,32.9364161849711
|
|
|
3 |
hy_whisper_local_cs,,
|
4 |
Qwen2-Audio-7B-Instruct,40.77727272727273,33.78034682080925
|
5 |
whisper_large_v3,,
|
6 |
old_models,,
|
|
|
7 |
gemini-1.5-flash,,
|
8 |
WavLLM_fairseq,5.5,6.901734104046243
|
9 |
+
MERaLiON-AudioLLM-Whisper-SEA-LION,39.38636363636363,34.566473988439306
|
10 |
+
MERaLiON-AudioLLM-v2-2b,35.07727272727273,31.410404624277458
|
11 |
+
MERaLiON-AudioLLM-v2-9b,36.04090909090909,35.16763005780347
|
12 |
+
Qwen2.5-Omni-3B,43.69545454545454,34.70520231213873
|
13 |
+
Qwen2.5-Omni-7B,37.7,26.09248554913295
|
14 |
+
SALMONN_7B,35.24090909090909,22.520231213872833
|
15 |
+
cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct,2.4545454545454546,3.8265895953757223
|
16 |
+
cascade_whisper_large_v3_llama_3_8b_instruct,2.5136363636363637,3.3179190751445087
|
17 |
+
phi_4_multimodal_instruct,33.595454545454544,28.069364161849713
|
results_organized/llama3_70b_judge/audio_scene_question_answering.csv
CHANGED
@@ -1,12 +1,17 @@
|
|
1 |
Model,clotho_aqa_test,audiocaps_qa_test,wavcaps_qa_test
|
2 |
Qwen-Audio-Chat,61.934856587263,50.22364217252396,42.69736842105263
|
3 |
-
MERaLiON-AudioLLM-Whisper-SEA-LION,63.15021876519203,49.77635782747604,46.31578947368421
|
4 |
hy_whisper_local_cs,,,
|
5 |
Qwen2-Audio-7B-Instruct,50.919591292758774,45.75079872204473,44.473684210526315
|
6 |
whisper_large_v3,,,
|
7 |
old_models,,,
|
8 |
-
cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct,24.647544968400585,18.466453674121407,18.88157894736842
|
9 |
gemini-1.5-flash,,,
|
10 |
WavLLM_fairseq,43.01199466903598,29.840255591054312,26.25
|
11 |
-
|
12 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
Model,clotho_aqa_test,audiocaps_qa_test,wavcaps_qa_test
|
2 |
Qwen-Audio-Chat,61.934856587263,50.22364217252396,42.69736842105263
|
|
|
3 |
hy_whisper_local_cs,,,
|
4 |
Qwen2-Audio-7B-Instruct,50.919591292758774,45.75079872204473,44.473684210526315
|
5 |
whisper_large_v3,,,
|
6 |
old_models,,,
|
|
|
7 |
gemini-1.5-flash,,,
|
8 |
WavLLM_fairseq,43.01199466903598,29.840255591054312,26.25
|
9 |
+
MERaLiON-AudioLLM-Whisper-SEA-LION,62.67379679144385,48.81789137380192,45.131578947368425
|
10 |
+
MERaLiON-AudioLLM-v2-2b,50.53962080700049,44.79233226837061,43.0921052631579
|
11 |
+
MERaLiON-AudioLLM-v2-9b,58.20126397666505,50.35143769968051,44.868421052631575
|
12 |
+
Qwen2.5-Omni-3B,52.64948954788527,48.56230031948882,43.15789473684211
|
13 |
+
Qwen2.5-Omni-7B,46.592124453087024,50.41533546325879,40.0
|
14 |
+
SALMONN_7B,58.19154107924162,50.35143769968051,46.90789473684211
|
15 |
+
cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct,22.673796791443852,17.44408945686901,14.013157894736842
|
16 |
+
cascade_whisper_large_v3_llama_3_8b_instruct,29.820126397666506,17.06070287539936,18.75
|
17 |
+
phi_4_multimodal_instruct,48.37141468157511,40.319488817891376,37.96052631578947
|
results_organized/llama3_70b_judge/emotion_recognition.csv
CHANGED
@@ -1,12 +1,17 @@
|
|
1 |
Model,iemocap_emotion_test,meld_sentiment_test,meld_emotion_test
|
2 |
Qwen-Audio-Chat,29.382470119521916,44.90421455938697,50.72796934865901
|
3 |
-
MERaLiON-AudioLLM-Whisper-SEA-LION,48.505976095617534,46.206896551724135,36.36015325670498
|
4 |
hy_whisper_local_cs,,,
|
5 |
Qwen2-Audio-7B-Instruct,53.98406374501992,53.9463601532567,41.60919540229885
|
6 |
whisper_large_v3,,,
|
7 |
old_models,,,
|
8 |
-
cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct,44.322709163346616,56.59003831417625,47.356321839080465
|
9 |
gemini-1.5-flash,,,
|
10 |
WavLLM_fairseq,59.76095617529881,51.072796934865906,41.57088122605364
|
11 |
-
|
12 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
Model,iemocap_emotion_test,meld_sentiment_test,meld_emotion_test
|
2 |
Qwen-Audio-Chat,29.382470119521916,44.90421455938697,50.72796934865901
|
|
|
3 |
hy_whisper_local_cs,,,
|
4 |
Qwen2-Audio-7B-Instruct,53.98406374501992,53.9463601532567,41.60919540229885
|
5 |
whisper_large_v3,,,
|
6 |
old_models,,,
|
|
|
7 |
gemini-1.5-flash,,,
|
8 |
WavLLM_fairseq,59.76095617529881,51.072796934865906,41.57088122605364
|
9 |
+
MERaLiON-AudioLLM-Whisper-SEA-LION,49.103585657370516,52.452107279693486,44.17624521072797
|
10 |
+
MERaLiON-AudioLLM-v2-2b,51.39442231075698,58.582375478927204,52.1455938697318
|
11 |
+
MERaLiON-AudioLLM-v2-9b,62.54980079681275,68.85057471264368,59.808429118773944
|
12 |
+
Qwen2.5-Omni-3B,34.36254980079681,30.421455938697317,34.32950191570881
|
13 |
+
Qwen2.5-Omni-7B,36.55378486055777,27.77777777777778,30.07662835249042
|
14 |
+
SALMONN_7B,26.195219123505975,42.26053639846744,32.298850574712645
|
15 |
+
cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct,41.98207171314741,58.39080459770115,44.272030651341
|
16 |
+
cascade_whisper_large_v3_llama_3_8b_instruct,46.91235059760957,56.47509578544061,49.42528735632184
|
17 |
+
phi_4_multimodal_instruct,32.07171314741036,49.11877394636016,40.84291187739464
|
results_organized/llama3_70b_judge/gender_recognition.csv
CHANGED
@@ -1,12 +1,17 @@
|
|
1 |
-
Model,voxceleb_gender_test,iemocap_gender_test
|
2 |
-
Qwen-Audio-Chat,70.5990972507181,50.0996015936255
|
3 |
-
|
4 |
-
|
5 |
-
|
6 |
-
|
7 |
-
|
8 |
-
|
9 |
-
|
10 |
-
|
11 |
-
|
12 |
-
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Model,voxceleb_gender_test,iemocap_gender_test
|
2 |
+
Qwen-Audio-Chat,70.5990972507181,50.0996015936255
|
3 |
+
hy_whisper_local_cs,,
|
4 |
+
Qwen2-Audio-7B-Instruct,99.1177677472302,92.80876494023904
|
5 |
+
whisper_large_v3,,
|
6 |
+
old_models,,
|
7 |
+
gemini-1.5-flash,,
|
8 |
+
WavLLM_fairseq,69.61427985227739,51.932270916334666
|
9 |
+
MERaLiON-AudioLLM-Whisper-SEA-LION,99.73327862125564,94.6215139442231
|
10 |
+
MERaLiON-AudioLLM-v2-2b,99.69224456298728,87.92828685258964
|
11 |
+
MERaLiON-AudioLLM-v2-9b,97.2507180960197,92.96812749003983
|
12 |
+
Qwen2.5-Omni-3B,32.78621255642183,62.948207171314735
|
13 |
+
Qwen2.5-Omni-7B,54.08288879770209,43.366533864541836
|
14 |
+
SALMONN_7B,88.53098071399262,80.199203187251
|
15 |
+
cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct,26.631103816167418,12.211155378486056
|
16 |
+
cascade_whisper_large_v3_llama_3_8b_instruct,69.69634796881412,44.38247011952191
|
17 |
+
phi_4_multimodal_instruct,94.58350430857611,46.852589641434264
|
results_organized/llama3_70b_judge/music_understanding.csv
CHANGED
@@ -1,12 +1,17 @@
|
|
1 |
Model,muchomusic_test
|
2 |
Qwen-Audio-Chat,59.0564448188711
|
3 |
-
MERaLiON-AudioLLM-Whisper-SEA-LION,57.7927548441449
|
4 |
hy_whisper_local_cs,
|
5 |
Qwen2-Audio-7B-Instruct,71.60909856781802
|
6 |
whisper_large_v3,
|
7 |
old_models,
|
8 |
-
cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct,51.727042965459134
|
9 |
gemini-1.5-flash,
|
10 |
WavLLM_fairseq,44.3133951137321
|
11 |
-
|
12 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
Model,muchomusic_test
|
2 |
Qwen-Audio-Chat,59.0564448188711
|
|
|
3 |
hy_whisper_local_cs,
|
4 |
Qwen2-Audio-7B-Instruct,71.60909856781802
|
5 |
whisper_large_v3,
|
6 |
old_models,
|
|
|
7 |
gemini-1.5-flash,
|
8 |
WavLLM_fairseq,44.3133951137321
|
9 |
+
MERaLiON-AudioLLM-Whisper-SEA-LION,51.34793597304128
|
10 |
+
MERaLiON-AudioLLM-v2-2b,55.602358887952825
|
11 |
+
MERaLiON-AudioLLM-v2-9b,63.94271272114573
|
12 |
+
Qwen2.5-Omni-3B,59.30918281381634
|
13 |
+
Qwen2.5-Omni-7B,47.598989048020215
|
14 |
+
SALMONN_7B,49.70513900589722
|
15 |
+
cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct,50.463352990732936
|
16 |
+
cascade_whisper_large_v3_llama_3_8b_instruct,56.697556866048856
|
17 |
+
phi_4_multimodal_instruct,55.2653748946925
|
results_organized/llama3_70b_judge/sds_singlish.csv
CHANGED
@@ -1,12 +1,17 @@
|
|
1 |
Model,imda_part3_30s_ds_human_test,imda_part4_30s_ds_human_test,imda_part5_30s_ds_human_test,imda_part6_30s_ds_human_test
|
2 |
Qwen-Audio-Chat,16.4,16.0,28.2,40.4
|
3 |
-
MERaLiON-AudioLLM-Whisper-SEA-LION,48.4,46.4,57.0,62.599999999999994
|
4 |
hy_whisper_local_cs,,,,
|
5 |
Qwen2-Audio-7B-Instruct,33.8,24.8,40.4,46.2
|
6 |
whisper_large_v3,,,,
|
7 |
old_models,,,,
|
8 |
-
cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct,45.4,44.0,58.0,65.4
|
9 |
gemini-1.5-flash,,,,
|
10 |
-
WavLLM_fairseq,31.6,31.6,45.
|
11 |
-
|
12 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
Model,imda_part3_30s_ds_human_test,imda_part4_30s_ds_human_test,imda_part5_30s_ds_human_test,imda_part6_30s_ds_human_test
|
2 |
Qwen-Audio-Chat,16.4,16.0,28.2,40.4
|
|
|
3 |
hy_whisper_local_cs,,,,
|
4 |
Qwen2-Audio-7B-Instruct,33.8,24.8,40.4,46.2
|
5 |
whisper_large_v3,,,,
|
6 |
old_models,,,,
|
|
|
7 |
gemini-1.5-flash,,,,
|
8 |
+
WavLLM_fairseq,31.6,31.6,45.2,49.400000000000006
|
9 |
+
MERaLiON-AudioLLM-Whisper-SEA-LION,47.800000000000004,46.4,54.6,65.6
|
10 |
+
MERaLiON-AudioLLM-v2-2b,42.2,40.199999999999996,51.8,60.0
|
11 |
+
MERaLiON-AudioLLM-v2-9b,49.8,46.6,55.4,60.599999999999994
|
12 |
+
Qwen2.5-Omni-3B,42.800000000000004,33.199999999999996,52.199999999999996,58.8
|
13 |
+
Qwen2.5-Omni-7B,39.8,31.6,42.800000000000004,58.4
|
14 |
+
SALMONN_7B,9.0,7.4,16.0,25.2
|
15 |
+
cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct,48.4,45.599999999999994,53.4,56.6
|
16 |
+
cascade_whisper_large_v3_llama_3_8b_instruct,38.0,38.199999999999996,46.2,61.0
|
17 |
+
phi_4_multimodal_instruct,43.6,42.800000000000004,55.599999999999994,61.0
|
results_organized/llama3_70b_judge/speech_instruction.csv
CHANGED
@@ -1,12 +1,17 @@
|
|
1 |
Model,openhermes_audio_test,alpaca_audio_test
|
2 |
-
Qwen-Audio-Chat,10.
|
3 |
-
MERaLiON-AudioLLM-Whisper-SEA-LION,65.6,74.80000000000001
|
4 |
hy_whisper_local_cs,,
|
5 |
-
Qwen2-Audio-7B-Instruct,44.
|
6 |
whisper_large_v3,,
|
7 |
old_models,,
|
8 |
-
cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct,72.2,73.8
|
9 |
gemini-1.5-flash,,
|
10 |
WavLLM_fairseq,19.2,21.6
|
11 |
-
|
12 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
Model,openhermes_audio_test,alpaca_audio_test
|
2 |
+
Qwen-Audio-Chat,10.6,9.8
|
|
|
3 |
hy_whisper_local_cs,,
|
4 |
+
Qwen2-Audio-7B-Instruct,44.8,52.599999999999994
|
5 |
whisper_large_v3,,
|
6 |
old_models,,
|
|
|
7 |
gemini-1.5-flash,,
|
8 |
WavLLM_fairseq,19.2,21.6
|
9 |
+
MERaLiON-AudioLLM-Whisper-SEA-LION,66.39999999999999,75.19999999999999
|
10 |
+
MERaLiON-AudioLLM-v2-2b,12.6,25.6
|
11 |
+
MERaLiON-AudioLLM-v2-9b,66.2,74.2
|
12 |
+
Qwen2.5-Omni-3B,66.0,64.0
|
13 |
+
Qwen2.5-Omni-7B,57.400000000000006,59.2
|
14 |
+
SALMONN_7B,15.4,10.4
|
15 |
+
cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct,78.8,67.0
|
16 |
+
cascade_whisper_large_v3_llama_3_8b_instruct,62.800000000000004,69.4
|
17 |
+
phi_4_multimodal_instruct,39.0,33.4
|
results_organized/llama3_70b_judge/sqa_english.csv
CHANGED
@@ -1,12 +1,15 @@
|
|
1 |
-
Model,slue_p2_sqa5_test,public_sg_speech_qa_test,spoken_squad_test,cn_college_listen_mcq_test,dream_tts_mcq_test
|
2 |
-
Qwen-Audio-Chat,79.36274509803921,63.16860465116279,64.8327415436367,63.
|
3 |
-
|
4 |
-
|
5 |
-
|
6 |
-
|
7 |
-
|
8 |
-
|
9 |
-
|
10 |
-
|
11 |
-
|
12 |
-
|
|
|
|
|
|
|
|
1 |
+
Model,slue_p2_sqa5_test,public_sg_speech_qa_test,spoken_squad_test,cn_college_listen_mcq_test,dream_tts_mcq_test,mmau_mini
|
2 |
+
Qwen-Audio-Chat,79.36274509803921,63.16860465116279,64.8327415436367,63.23205636283576,59.749085206481965,
|
3 |
+
hy_whisper_local_cs,,,,,,
|
4 |
+
Qwen2-Audio-7B-Instruct,80.04901960784315,58.31395348837209,64.86264249672958,74.7247908410392,66.49242028227914,
|
5 |
+
whisper_large_v3,,,,,,
|
6 |
+
old_models,,,,,,
|
7 |
+
gemini-1.5-flash,,,,89.25583443416997,,
|
8 |
+
WavLLM_fairseq,83.92156862745098,58.54651162790698,77.64903756307233,66.31439894319684,66.5446941975954,
|
9 |
+
MERaLiON-AudioLLM-Whisper-SEA-LION,86.7156862745098,59.59302325581396,74.20669033825453,57.11140466754734,51.54208050182959,53.1
|
10 |
+
MERaLiON-AudioLLM-v2-2b,83.18627450980392,69.47674418604652,81.4614090824145,66.00616468516073,61.16048092002091,50.99999999999999
|
11 |
+
MERaLiON-AudioLLM-v2-9b,89.55882352941177,75.02906976744187,89.20949355260699,84.58828709819463,83.32462101411396,56.699999999999996
|
12 |
+
SALMONN_7B,80.88235294117646,59.38953488372093,65.64754251541768,50.81461911052399,56.56037637219028,50.6
|
13 |
+
cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct,51.51960784313726,70.93023255813954,57.16314707531303,89.52003522677234,85.15420805018296,52.6
|
14 |
+
cascade_whisper_large_v3_llama_3_8b_instruct,86.96078431372548,69.68023255813954,87.43412446271725,84.98458828709819,86.1996863565081,55.900000000000006
|
15 |
+
phi_4_multimodal_instruct,83.72549019607844,74.18604651162791,83.19566436180153,75.6494936151475,77.5222164140094,58.8
|
results_organized/llama3_70b_judge/sqa_singlish.csv
CHANGED
@@ -1,12 +1,17 @@
|
|
1 |
Model,imda_part3_30s_sqa_human_test,imda_part4_30s_sqa_human_test,imda_part5_30s_sqa_human_test,imda_part6_30s_sqa_human_test
|
2 |
-
Qwen-Audio-Chat,32.2,37.8,47.
|
3 |
-
MERaLiON-AudioLLM-Whisper-SEA-LION,51.4,53.2,64.80000000000001,67.2
|
4 |
hy_whisper_local_cs,,,,
|
5 |
Qwen2-Audio-7B-Instruct,42.0,39.6,51.6,53.6
|
6 |
whisper_large_v3,,,,
|
7 |
old_models,,,,
|
8 |
-
cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct,56.0,66.0,74.0,71.6
|
9 |
gemini-1.5-flash,,,,
|
10 |
-
WavLLM_fairseq,45.
|
11 |
-
|
12 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
Model,imda_part3_30s_sqa_human_test,imda_part4_30s_sqa_human_test,imda_part5_30s_sqa_human_test,imda_part6_30s_sqa_human_test
|
2 |
+
Qwen-Audio-Chat,32.2,37.8,47.8,51.4
|
|
|
3 |
hy_whisper_local_cs,,,,
|
4 |
Qwen2-Audio-7B-Instruct,42.0,39.6,51.6,53.6
|
5 |
whisper_large_v3,,,,
|
6 |
old_models,,,,
|
|
|
7 |
gemini-1.5-flash,,,,
|
8 |
+
WavLLM_fairseq,45.2,46.6,50.8,62.2
|
9 |
+
MERaLiON-AudioLLM-Whisper-SEA-LION,55.199999999999996,50.0,63.0,67.4
|
10 |
+
MERaLiON-AudioLLM-v2-2b,52.599999999999994,54.6,61.4,70.19999999999999
|
11 |
+
MERaLiON-AudioLLM-v2-9b,59.400000000000006,63.0,72.0,71.8
|
12 |
+
Qwen2.5-Omni-3B,52.400000000000006,54.400000000000006,66.0,69.2
|
13 |
+
Qwen2.5-Omni-7B,54.2,52.0,62.800000000000004,64.6
|
14 |
+
SALMONN_7B,42.0,35.4,45.8,49.6
|
15 |
+
cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct,51.4,46.4,54.6,62.599999999999994
|
16 |
+
cascade_whisper_large_v3_llama_3_8b_instruct,51.6,55.599999999999994,62.0,68.2
|
17 |
+
phi_4_multimodal_instruct,55.0,56.4,64.6,71.8
|
results_organized/llama3_70b_judge/under_development_llama3_70b_judge.csv
CHANGED
@@ -7,6 +7,6 @@ whisper_large_v3,,,
|
|
7 |
old_models,,,
|
8 |
cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct,70.18719211822659,64.12654745529574,55.01831501831502
|
9 |
gemini-1.5-flash,78.06896551724138,65.9697386519945,49.908424908424905
|
10 |
-
WavLLM_fairseq,60.70935960591133,55.
|
11 |
-
SALMONN_7B,55.665024630541865,31.
|
12 |
cascade_whisper_large_v3_llama_3_8b_instruct,67.3103448275862,59.44979367262724,52.252747252747255
|
|
|
7 |
old_models,,,
|
8 |
cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct,70.18719211822659,64.12654745529574,55.01831501831502
|
9 |
gemini-1.5-flash,78.06896551724138,65.9697386519945,49.908424908424905
|
10 |
+
WavLLM_fairseq,60.70935960591133,55.62585969738653,40.95238095238095
|
11 |
+
SALMONN_7B,55.665024630541865,31.27922971114167,32.124542124542124
|
12 |
cascade_whisper_large_v3_llama_3_8b_instruct,67.3103448275862,59.44979367262724,52.252747252747255
|
results_organized/meteor/audio_captioning.csv
CHANGED
@@ -1,12 +1,12 @@
|
|
1 |
Model,audiocaps_test,wavcaps_test
|
2 |
-
Qwen-Audio-Chat,0.
|
3 |
-
MERaLiON-AudioLLM-Whisper-SEA-LION,0.
|
4 |
hy_whisper_local_cs,,
|
5 |
-
Qwen2-Audio-7B-Instruct,0.
|
6 |
whisper_large_v3,,
|
7 |
old_models,,
|
8 |
-
cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct,0.
|
9 |
gemini-1.5-flash,,
|
10 |
-
WavLLM_fairseq,0.
|
11 |
-
SALMONN_7B,0.
|
12 |
-
cascade_whisper_large_v3_llama_3_8b_instruct,0.
|
|
|
1 |
Model,audiocaps_test,wavcaps_test
|
2 |
+
Qwen-Audio-Chat,0.2755301507695097,0.2355106805560457
|
3 |
+
MERaLiON-AudioLLM-Whisper-SEA-LION,0.2492004703435381,0.3175511907248581
|
4 |
hy_whisper_local_cs,,
|
5 |
+
Qwen2-Audio-7B-Instruct,0.1989171207631428,0.2134229485619918
|
6 |
whisper_large_v3,,
|
7 |
old_models,,
|
8 |
+
cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct,0.0579681972394305,0.120421856260385
|
9 |
gemini-1.5-flash,,
|
10 |
+
WavLLM_fairseq,0.0417329650944285,0.0639952252468867
|
11 |
+
SALMONN_7B,0.2099405248433995,0.1717511277065815
|
12 |
+
cascade_whisper_large_v3_llama_3_8b_instruct,0.0795304845778549,0.1388630786594543
|
results_organized/wer/asr_english.csv
CHANGED
@@ -1,12 +1,18 @@
|
|
1 |
Model,librispeech_test_clean,librispeech_test_other,common_voice_15_en_test,peoples_speech_test,gigaspeech_test,earnings21_test,earnings22_test,tedlium3_test,tedlium3_long_form_test
|
2 |
-
Qwen-Audio-Chat,0.
|
3 |
-
|
4 |
-
|
5 |
-
Qwen2-Audio-7B-Instruct,0.035141660693401744,0.060415760304159495,0.11438872500819404,0.2165498391593041,0.11723812890302816,0.18872219319407232,0.23542555661330924,0.06114048472375004,0.08739585179932637
|
6 |
-
whisper_large_v3,0.01878749009695552,0.03660128246354058,0.10001863741235596,0.14602420615337386,0.09459022434812692,0.11863959266711877,0.15887899737116104,0.037649480146197796,0.03208650948413402
|
7 |
old_models,,,,,,,,,
|
8 |
-
cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct,0.032349945297468596,0.05307658841999735,0.10600831614192711,0.20140159998943682,0.09948381629977261,0.11416493424197618,0.1448629161356777,0.04900464852205386,0.04396383619925545
|
9 |
gemini-1.5-flash,,,,,,,,,
|
10 |
-
WavLLM_fairseq,0.
|
11 |
-
|
12 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
Model,librispeech_test_clean,librispeech_test_other,common_voice_15_en_test,peoples_speech_test,gigaspeech_test,earnings21_test,earnings22_test,tedlium3_test,tedlium3_long_form_test
|
2 |
+
Qwen-Audio-Chat,0.0202587995623797,0.043467569561352,0.1127242112839891,0.3141914474672335,0.1301891002258773,0.2655529121410546,0.3664994875132684,0.0405237571413363,0.2911540507002305
|
3 |
+
Qwen2-Audio-7B-Instruct,0.0351416606934017,0.0604157603041594,0.114388725008194,0.2165498391593041,0.1172381289030281,0.1887221931940723,0.2354255566133092,0.06114048472375,0.0873958517993263
|
4 |
+
whisper_large_v3,0.0187874900969555,0.0366012824635405,0.1000186374123559,0.1460242061533738,0.0945902243481269,0.1186395926671187,0.158878997371161,0.0376494801461977,0.032086509484134
|
|
|
|
|
5 |
old_models,,,,,,,,,
|
|
|
6 |
gemini-1.5-flash,,,,,,,,,
|
7 |
+
WavLLM_fairseq,0.0210321801788206,0.0479883481188643,0.1453332562130063,0.3792176325635977,0.154917784145464,0.6447482518259942,0.6671766188447099,0.0662148255917107,0.4536784258110264
|
8 |
+
MERaLiON-AudioLLM-Whisper-SEA-LION,0.023937073225940318,0.0422569845082944,0.07797507728099434,0.21620323529945748,0.14477210452030514,0.13838923413858656,0.16553574886426656,0.08154430289911642,0.10512320510547775
|
9 |
+
MERaLiON-AudioLLM-v2-2b,0.027124910401026145,0.050958064577146425,0.09270505973611995,0.20627055897299626,0.09237908290276242,0.21886082422652334,0.23935918375209228,0.03456229374401192,0.13837971990781775
|
10 |
+
MERaLiON-AudioLLM-v2-9b,0.02497453502848304,0.046607524542720415,0.08676036786395974,0.20476530792451958,0.09023061553464748,0.1084090226901313,0.15062142184399924,0.03513005216280473,0.043573834426520124
|
11 |
+
MERaLiON-AudioLLM-v2-9b-asr,0.020956728411363035,0.04040327614579984,0.0761563229028091,0.1957668115250735,0.08768103407213536,0.09210848128425476,0.1277414998676963,0.0313686526383024,0.03495834071973054
|
12 |
+
Qwen2.5-Omni-3B,0.01765571358509073,0.03898462178674788,0.08397118270448134,0.2217852079375585,0.09894231227233641,0.12490689375326566,0.18720009894897133,0.03211383556296796,0.052153873426697396
|
13 |
+
Qwen2.5-Omni-7B,0.02252235258610933,0.04165169198176556,0.08635548614726127,0.31617534194121266,0.12679717916513114,0.23232370957521317,0.2807910240306093,0.0633760334977467,0.09094132246055664
|
14 |
+
SALMONN_7B,0.09638963292715132,0.11776722719276675,0.315955552984878,0.24158949229136512,0.11024871580815716,0.27733154717568453,0.37956460424973665,0.039352755402576205,0.14139336996986349
|
15 |
+
cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct,0.03299128532085864,0.05381428868670437,0.10610471655066483,0.20285898669536326,0.09994259054523941,0.14091838890062366,0.17187922953626794,0.04939498243497392,0.08636766530756958
|
16 |
+
cascade_whisper_large_v3_llama_3_8b_instruct,0.018032972422378994,0.035504189759207064,0.09879113887442882,0.14542012514049835,0.09501640807342393,0.10872308256717546,0.1459710229559586,0.038146268762641496,0.04935295160432548
|
17 |
+
hy_whisper_local_cs,0.029086656354925113,0.05591389713810127,0.1066766923091754,0.17879147486544342,0.10212866235970408,0.14925070316060968,0.17014458107377883,0.04666264504453355,0.06973940790639957
|
18 |
+
phi_4_multimodal_instruct,0.016844607084920964,0.03851173700039722,0.08109202383018103,0.2147161396912585,0.0988294989332872,0.1306461295594268,0.22572024408764688,0.028636315247862035,0.05062932104236838
|
results_organized/wer/asr_mandarin.csv
CHANGED
@@ -1,12 +1,18 @@
|
|
1 |
-
Model,aishell_asr_zh_test
|
2 |
-
Qwen-Audio-Chat,0.
|
3 |
-
|
4 |
-
|
5 |
-
|
6 |
-
|
7 |
-
|
8 |
-
|
9 |
-
|
10 |
-
|
11 |
-
|
12 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Model,aishell_asr_zh_test,commonvoice_zh_asr
|
2 |
+
Qwen-Audio-Chat,0.9469917443725128,
|
3 |
+
Qwen2-Audio-7B-Instruct,0.0926035912969452,
|
4 |
+
whisper_large_v3,0.1235968402922135,
|
5 |
+
old_models,,
|
6 |
+
gemini-1.5-flash,,
|
7 |
+
WavLLM_fairseq,0.7054601967888183,
|
8 |
+
MERaLiON-AudioLLM-Whisper-SEA-LION,0.12846706657955692,0.3269799259362027
|
9 |
+
MERaLiON-AudioLLM-v2-2b,0.05010789728969927,0.13139387212789344
|
10 |
+
MERaLiON-AudioLLM-v2-9b,0.05789827958266516,0.14684695260557293
|
11 |
+
MERaLiON-AudioLLM-v2-9b-asr,0.043317297222387204,0.1183419954537208
|
12 |
+
Qwen2.5-Omni-3B,0.08080418126744669,0.08551487145555639
|
13 |
+
Qwen2.5-Omni-7B,0.08943596444338857,0.0775535468448182
|
14 |
+
SALMONN_7B,0.9314703727900854,1.0013340021130595
|
15 |
+
cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct,0.20889509215814378,0.31938144990021666
|
16 |
+
cascade_whisper_large_v3_llama_3_8b_instruct,0.12450753301261111,0.1962263748225777
|
17 |
+
hy_whisper_local_cs,0.15675793391538476,0.287290695068461
|
18 |
+
phi_4_multimodal_instruct,0.12232978955079092,0.154221316286565
|
results_organized/wer/asr_private.csv
ADDED
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Model,cna_test,idpc_short_test,idpc_test,mediacorp_short_test,mediacorp_test,parliament_test,ukusnews_test,ytb_asr_batch1,ytb_asr_batch2,ytb_asr_batch3_chinese,ytb_asr_batch3_malay,ytb_asr_batch3_tamil
|
2 |
+
MERaLiON-AudioLLM-Whisper-SEA-LION,0.14503898323187012,0.16498433693003828,0.20359281437125748,0.12828873397796267,0.12250898399215943,0.058780395496262655,0.1128757799205899,0.10724437274333563,0.13268461455292463,0.418102808691044,0.28989513404414025,0.6929759165018962
|
3 |
+
MERaLiON-AudioLLM-v2-2b,0.13494606429563175,0.15106160807518274,0.17741659538066723,0.1208680008994828,0.12250898399215943,0.18544800832623712,0.17383248251087163,0.09933164323576861,0.15990917937074278,0.25613142554319024,0.2798911851169321,0.7504943113675407
|
4 |
+
MERaLiON-AudioLLM-v2-9b,0.13334401367083198,0.15663069961712495,0.16030795551753635,0.11693276366089499,0.10454099967330938,0.06024694862333239,0.06972962752883342,0.09848659445340709,0.1110174072872743,0.19133015368309486,0.20907375718485366,0.6644679264853651
|
5 |
+
MERaLiON-AudioLLM-v2-9b-asr,0.12709601623411299,0.14009745910198398,0.16612489307100087,0.11783224645828648,0.10372427311336165,0.05284322073989971,0.055965210814898844,0.09230237381885227,0.09936209319926478,0.1494223635400106,0.19463823439076827,0.5467894071504975
|
6 |
+
Qwen2.5-Omni-3B,0.15224821104346897,0.3038635572572224,0.19743370402053037,0.13660894985383404,0.1391702058150931,0.09165957044185827,0.0828512006050293,0.12241683951755397,0.24802681370959023,0.2562374138844727,2.2815585099381335,1.2873650773070564
|
7 |
+
Qwen2.5-Omni-7B,0.17280786072839902,0.4491820396797772,0.6198460222412319,0.26714639082527547,0.3391048676902973,0.2558898665909736,0.22628096048402344,0.20300376430821235,0.34827548924208024,0.19881293057763647,1.4799262866921152,1.0804025801432693
|
8 |
+
SALMONN_7B,0.1492577165438428,0.2398190045248869,0.5414884516680923,0.19901056892286936,0.3636883371447239,0.20430031223389156,0.191869918699187,0.2207497887378044,0.3495513028435506,0.8858293587705353,1.0858672282918695,0.985267900554277
|
9 |
+
cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct,0.15171419416853574,0.19735468151757746,0.17040205303678357,0.1541488644029683,0.15754655341391702,0.09007474690131517,0.12278313480809226,0.12475992932319274,0.12552708400908205,0.3469210386857446,0.3143784827344127,0.9665002755178114
|
10 |
+
cascade_whisper_large_v3_llama_3_8b_instruct,0.13815016554523124,0.15344926428434932,0.16184773310521813,0.11434675061839443,0.15125775890231952,0.06537988456807645,0.08943089430894309,0.10816624414227549,0.08387933830684398,0.2698675145733969,0.3119213724715897,0.8976532365239376
|
11 |
+
hy_whisper_local_cs,0.14674783723165652,0.18308388444135051,0.17570573139435414,0.12885091072633237,0.1256125449199608,0.07257072570725707,0.16948383437322745,0.1284858262272413,0.14315061087685155,0.27520932697403283,0.2421569917950068,0.8339924151567211
|
12 |
+
phi_4_multimodal_instruct,0.19080422941364947,0.5388096066829099,0.26073567151411464,0.1217674836968743,0.19813786344331918,0.2778645094143249,0.07521270561542824,0.16939386955519706,0.23232781922369986,0.44008479067302597,3.762932736606555,2.7500567242552916
|
results_organized/wer/asr_sea.csv
ADDED
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Model,commonvoice_17_id_asr,commonvoice_17_ta_asr,commonvoice_17_vi_asr,fleurs_tamil_ta_30_asr,gigaspeech2_id_test,gigaspeech2_th_test,gigaspeech2_vi_test,lotus_thai_th_30_asr
|
2 |
+
MERaLiON-AudioLLM-Whisper-SEA-LION,0.25954549636581103,0.5284951114826634,0.9221892864704637,0.4624736472241743,0.337184855698226,0.9866395307075302,0.9818897503814326,0.8520208370756243
|
3 |
+
MERaLiON-AudioLLM-v2-2b,0.08547244456711749,0.13853008043879414,0.14196485284776625,0.1432185523541813,0.17842684134623737,0.19968394588770502,0.16825573283269715,0.014873360876594216
|
4 |
+
MERaLiON-AudioLLM-v2-9b,0.11334989419449812,0.15591770571023683,0.15646834639000634,0.16085734364019677,0.1722759890883186,0.20004788698671136,0.11314793912959634,0.018681516076881625
|
5 |
+
MERaLiON-AudioLLM-v2-9b-asr,0.07921611923820039,0.12871226564172622,0.1423883125132331,0.1383345045678145,0.16282383194620612,0.18238237758889023,0.09499798648962901,0.010670019759295851
|
6 |
+
Qwen2.5-Omni-3B,0.13731714049130556,1.0276387288835422,0.2463476603853483,1.3477160927617708,0.3110002953799107,0.4670274152998923,0.19581530154444754,0.4822705227231902
|
7 |
+
Qwen2.5-Omni-7B,0.18235348238108381,1.0684188526512177,0.22041075587550285,1.2090302178496135,0.26146334682814104,0.2936956781994493,0.22408385278119664,0.0984012933357284
|
8 |
+
SALMONN_7B,1.1888858220627472,1.4272941368377052,1.496294727927165,1.507519325368939,2.1181172136986777,1.2470441757452413,1.5460526688938172,1.1351535836177475
|
9 |
+
cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct,0.09977918851780293,0.23805397249380653,0.1567859411391065,0.2724525650035137,0.2191718937327333,0.276058900993655,0.17136958408249153,0.06815160768816239
|
10 |
+
cascade_whisper_large_v3_llama_3_8b_instruct,0.07815806421933941,0.24404355317218387,0.11676900275248782,0.28397751229796203,0.1926224523482703,0.20872022028013887,0.15538061017872032,0.031794503323154304
|
11 |
+
hy_whisper_local_cs,0.10267733922163952,0.31793713743921215,0.1681134871903451,0.33113141250878425,0.21382030476256667,0.26486292350053875,0.1781020821398794,0.076019400035926
|
12 |
+
phi_4_multimodal_instruct,1.327169012788665,1.1784589191228196,1.1070294304467498,1.7016514406184118,5.803850364012302,1.7344522925894887,2.5042567310800923,1.2856834920064666
|
results_organized/wer/asr_singlish.csv
CHANGED
@@ -1,12 +1,18 @@
|
|
1 |
Model,imda_part1_asr_test,imda_part2_asr_test,imda_part3_30s_asr_test,imda_part4_30s_asr_test,imda_part5_30s_asr_test,imda_part6_30s_asr_test
|
2 |
-
Qwen-Audio-Chat,0.
|
3 |
-
|
4 |
-
|
5 |
-
Qwen2-Audio-7B-Instruct,0.07197717796796138,0.1905689473257041,0.35076166942732234,0.5613424034000176,0.27856006770658537,0.2245352799625317
|
6 |
-
whisper_large_v3,0.06844171360300393,0.3171008846684522,0.27026366524560785,0.4618189591218298,0.2143555471246589,0.1698509342851144
|
7 |
old_models,,,,,,
|
8 |
-
cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct,0.07041669714480775,0.32988393799204613,0.3035544573275043,0.4779640131272869,0.22881615619208825,0.1789273082575623
|
9 |
gemini-1.5-flash,,,,,,
|
10 |
-
WavLLM_fairseq,0.
|
11 |
-
|
12 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
Model,imda_part1_asr_test,imda_part2_asr_test,imda_part3_30s_asr_test,imda_part4_30s_asr_test,imda_part5_30s_asr_test,imda_part6_30s_asr_test
|
2 |
+
Qwen-Audio-Chat,0.1055031331529027,0.4547926304683061,0.6412550574306894,1.173131813552289,0.3016882870525747,0.3139424086306303
|
3 |
+
Qwen2-Audio-7B-Instruct,0.0719771779679613,0.1905689473257041,0.3507616694273223,0.5613424034000176,0.2785600677065853,0.2245352799625317
|
4 |
+
whisper_large_v3,0.0684417136030039,0.3171008846684522,0.2702636652456078,0.4618189591218298,0.2143555471246589,0.1698509342851144
|
|
|
|
|
5 |
old_models,,,,,,
|
|
|
6 |
gemini-1.5-flash,,,,,,
|
7 |
+
WavLLM_fairseq,0.1007729256577182,0.4463923382842302,0.7540934640345399,1.143645714142011,0.3979658840524726,0.4254106170965293
|
8 |
+
MERaLiON-AudioLLM-Whisper-SEA-LION,0.04303513520103382,0.0473581689797906,0.21299589974746788,0.29660878421707804,0.15406166552363165,0.1087388362215152
|
9 |
+
MERaLiON-AudioLLM-v2-2b,0.049057615877892376,0.05819332846359873,0.26414044043772233,0.3595795244502006,0.20202536078562985,0.1493725673864242
|
10 |
+
MERaLiON-AudioLLM-v2-9b,0.051959134908443665,0.14532099667234802,0.22654574089662477,0.2948987161915779,0.16760298259181977,0.12655243140231592
|
11 |
+
MERaLiON-AudioLLM-v2-9b-asr,0.04362031550971643,0.054094635175716256,0.19622831075026476,0.24570911239925058,0.1403598371539887,0.0989680065892537
|
12 |
+
Qwen2.5-Omni-3B,0.04657059956599127,0.11265319373427482,0.49541097564287073,1.0728162054093475,0.273861464154908,0.17795830036014793
|
13 |
+
Qwen2.5-Omni-7B,0.04854558310779509,0.12052593133674215,0.6256143590300595,1.1316375158747123,0.34107192365498823,0.36374941455772863
|
14 |
+
SALMONN_7B,0.09275107892619414,0.45783621459297136,0.681280039101746,0.7865181254636674,0.37533379054734356,0.25522053004731987
|
15 |
+
cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct,0.07053860970911661,0.3298433568703839,0.2810437993863198,0.4594298934979693,0.21829536997854984,0.17514817745764627
|
16 |
+
cascade_whisper_large_v3_llama_3_8b_instruct,0.06922195401458074,0.31912994075156237,0.2770250088250468,0.4581096203900464,0.21391778902978215,0.1722411537654032
|
17 |
+
hy_whisper_local_cs,0.06692999780557385,0.2735167600032465,0.25580416542210876,0.3612895924757007,0.186411988735025,0.14417222500363377
|
18 |
+
phi_4_multimodal_instruct,0.057615877892375586,0.3451018586153721,0.4381839411301491,1.4697028756805695,0.23859275364433613,0.1439784234241509
|
results_organized/wer/under_development_wer.csv
CHANGED
@@ -1,14 +1,14 @@
|
|
1 |
-
Model,
|
2 |
-
Qwen-Audio-Chat,0.
|
3 |
-
MERaLiON-AudioLLM-Whisper-SEA-LION,0.
|
4 |
-
hy_whisper_local_cs,0.
|
5 |
-
Qwen2-Audio-7B-Instruct,0.
|
6 |
-
whisper_large_v3,0.
|
7 |
-
whisper_large_v2
|
8 |
-
old_models
|
9 |
-
cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct,0.
|
10 |
-
gemini-1.5-flash
|
11 |
-
WavLLM_fairseq,
|
12 |
-
SALMONN_7B,
|
13 |
-
cascade_whisper_large_v3_llama_3_8b_instruct,0.
|
14 |
-
Phi4-Multimodal-Instruct
|
|
|
1 |
+
Model,seame_dev_man,seame_dev_sge,ytb_asr_batch3_ms_ms_prompt,ytb_asr_batch3_zh_zh_prompt
|
2 |
+
Qwen-Audio-Chat,0.8783373786407767,1.05567969634822,2.899079022421131,
|
3 |
+
MERaLiON-AudioLLM-Whisper-SEA-LION,0.388282092772384,0.3555052190149683,0.3031898556447721,0.2826921038685744
|
4 |
+
hy_whisper_local_cs,0.3134101941747573,0.3319966941136857,,
|
5 |
+
Qwen2-Audio-7B-Instruct,0.5522518878101402,0.5486546879304539,0.9981132903339036,
|
6 |
+
whisper_large_v3,0.7225930420711975,0.5377268970583734,0.237374402,0.2127821939586645
|
7 |
+
whisper_large_v2,,,,0.2802967673555909
|
8 |
+
old_models,,,,
|
9 |
+
cascade_whisper_large_v2_gemma2_9b_cpt_sea_lionv3_instruct,0.7824973031283711,0.5840399155162387,,
|
10 |
+
gemini-1.5-flash,0.9690871089536138,1.110043160182436,,
|
11 |
+
WavLLM_fairseq,1.2913969795037756,1.2204842511249197,,
|
12 |
+
SALMONN_7B,1.2721817691477886,1.0189782362484312,,
|
13 |
+
cascade_whisper_large_v3_llama_3_8b_instruct,0.6848705501618123,0.507882090054792,,
|
14 |
+
Phi4-Multimodal-Instruct,,,,0.2153471118177
|