File size: 7,155 Bytes
0822dde
09820cd
 
 
 
 
 
 
 
0822dde
09820cd
0822dde
 
 
 
 
 
 
 
09820cd
 
0822dde
09820cd
0822dde
09820cd
 
0822dde
 
09820cd
0822dde
 
09820cd
 
 
 
0822dde
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
09820cd
 
 
 
 
 
 
 
 
 
0822dde
 
 
 
 
 
 
 
09820cd
0822dde
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
09820cd
0822dde
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
09820cd
 
 
 
 
 
 
 
 
 
 
 
 
 
0822dde
 
 
 
 
 
 
 
 
 
 
 
 
 
 
09820cd
 
0822dde
 
 
09820cd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
import os
import re
import streamlit as st
import requests
import pandas as pd
from io import StringIO
import plotly.graph_objs as go
from huggingface_hub import HfApi
from huggingface_hub.utils import RepositoryNotFoundError, RevisionNotFoundError
from dotenv import load_dotenv

# Load variables from a local .env file (when one exists) into the process
# environment before any configuration is read.
load_dotenv()

# URL requested by get_data(). os.getenv returns None when SERVER_URL is not
# set, in which case the HTTP request in get_data() will fail.
SERVER_URL = os.getenv("SERVER_URL")

def get_data(timeout=30):
    """Fetch the leaderboard payload from ``SERVER_URL`` and decode it as JSON.

    Args:
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely, which would freeze the
            Streamlit page on an unresponsive backend.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.RequestException: On connection problems, timeouts, an
            unset/invalid ``SERVER_URL``, or a 4xx/5xx HTTP status.
        ValueError: If the response body is not valid JSON.
    """
    response = requests.get(SERVER_URL, timeout=timeout)
    # Fail loudly on HTTP errors instead of parsing an error page as data.
    response.raise_for_status()
    return response.json()

# Leaderboard column label -> candidate keys under item["result"] holding the
# score. The benchmark picker offers "Winogrande" while the original lookup
# used the key "Winograde"; both spellings are accepted so the column always
# exists under the label the picker uses.
_BENCHMARK_RESULT_KEYS = {
    "ARC-Easy": ("ARC-Easy",),
    "ARC-Challenge": ("ARC-Challenge",),
    "Hellaswag": ("Hellaswag",),
    "Boolq": ("Boolq",),
    "MMLU": ("MMLU",),
    "Winogrande": ("Winogrande", "Winograde"),
    "Translation": ("Translation",),
    "Generation": ("Generation",),
}
_BENCHMARKS = list(_BENCHMARK_RESULT_KEYS)
_DEFAULT_BENCHMARKS = ["ARC-Easy", "ARC-Challenge", "Hellaswag", "Boolq", "MMLU"]
# NOTE(review): spellings are kept exactly as they were because they are
# matched against the "language" field of the server data - confirm against
# the backend before normalising them.
_LANGUAGES = ["kannada", "hindi", "tamil", "telegu", "gujarathi", "marathi", "malayalam"]
_ID_COLUMNS = ["Model Name", "Language"]
_LEADERBOARD_COLUMNS = [*_ID_COLUMNS, "Average", *_BENCHMARKS]


def _acc_norm(results, keys):
    """Return the first ``acc_norm`` found under any of ``keys``, else None."""
    for key in keys:
        entry = results.get(key)
        if isinstance(entry, dict) and "acc_norm" in entry:
            return entry["acc_norm"]
    return None


def _build_leaderboard_frame(data):
    """Turn the server payload into a DataFrame with one row per entry.

    Entries that are not dicts are skipped, and missing scores become NaN.
    The frame always has the full column set, even when ``data`` is empty.
    """
    rows = []
    for item in data or []:
        if not isinstance(item, dict):
            continue
        results = item.get("result")
        if not isinstance(results, dict):
            results = {}
        row = {
            "Model Name": item.get("name"),
            "Language": item.get("language"),
            "Average": _acc_norm(results, ("all",)),
        }
        for label, keys in _BENCHMARK_RESULT_KEYS.items():
            row[label] = _acc_norm(results, keys)
        rows.append(row)

    frame = pd.DataFrame(rows, columns=_LEADERBOARD_COLUMNS)
    # Coerce scores to numbers so the row-wise mean never trips over
    # None/strings coming from the server.
    for column in ["Average", *_BENCHMARKS]:
        frame[column] = pd.to_numeric(frame[column], errors="coerce")
    return frame


def _with_average(frame, benchmarks):
    """Return a copy of ``frame`` whose 'Average' is the mean of ``benchmarks``.

    Benchmarks that are not columns of ``frame`` are ignored; with nothing
    selected the average is NaN.
    """
    selected = [name for name in benchmarks if name in frame.columns]
    result = frame.copy()
    result["Average"] = result[selected].mean(axis=1)
    return result


def _filter_leaderboard(df, query, languages, benchmarks):
    """Apply the search box, language picker and benchmark picker to ``df``.

    ``query`` may hold several ``;``-separated terms; a row is kept when its
    model name contains any term (case-insensitive, literal match - the
    terms are not interpreted as regular expressions).
    """
    terms = [term.strip() for term in (query or "").split(";") if term.strip()]
    view = df
    if terms:
        names = df["Model Name"]
        mask = pd.Series(False, index=df.index, dtype=bool)
        for term in terms:
            mask |= names.str.contains(term, case=False, na=False, regex=False)
        view = df[mask]

    view = view[view["Language"].isin(languages)]
    wanted = {*_ID_COLUMNS, *benchmarks}
    kept_columns = [column for column in df.columns if column in wanted]
    return _with_average(view.loc[:, kept_columns], benchmarks)


def main():
    """Render the Indic LLM Leaderboard Streamlit page.

    Builds the header, the Leaderboard/About/FAQ/Submit tabs and the citation
    expander. The leaderboard tab fetches its data through ``get_data()`` on
    every script run; fetch failures are reported on the page instead of
    crashing the app.
    """
    st.set_page_config(page_title="Indic LLM Leaderboard", layout="wide")

    title_column, refresh_column = st.columns([.92, 0.08])
    with title_column:
        st.title("Indic LLM Leaderboard (α)")
        st.markdown("The Indic Eval Leaderboard utilizes the [indic_eval](https://github.com/adithya-s-k/indic_eval) evaluation framework , incorporating SOTA translated benchmarks like ARC, Hellaswag, MMLU, among others. Supporting 7 Indic languages, it offers a comprehensive platform for assessing model performance and comparing results within the Indic language modeling landscape.")
    with refresh_column:
        # A button click reruns the whole script, and the leaderboard tab
        # below fetches fresh data on every run, so no explicit fetch is
        # needed here (the original fetched twice and discarded one result).
        st.button("Refresh", type="primary")

    leaderboard_tab, about_tab, faq_tab, submit_tab = st.tabs(
        ["🏅 Leaderboard", "📝 About", "❗FAQ", "🚀 Submit"]
    )

    with leaderboard_tab:
        try:
            data = get_data()
        except (requests.RequestException, ValueError) as exc:
            st.error(f"Could not load leaderboard data: {exc}")
            data = []
        if not isinstance(data, list):
            st.error("Leaderboard server returned an unexpected payload.")
            data = []

        df = _build_leaderboard_frame(data)

        title = st.text_input(
            'Model Name',
            placeholder=" 🔍 Search for your model (separate multiple queries with `;`) and press ENTER...",
        )

        col1, col2 = st.columns(2)
        with col1:
            benchmark_options = st.multiselect(
                'Pick Benchmark',
                _BENCHMARKS,
                default=_DEFAULT_BENCHMARKS,
            )
        with col2:
            language_options = st.multiselect(
                'Pick Languages',
                _LANGUAGES,
                default=_LANGUAGES,
            )

        # As before, the table is hidden only when there is no search text
        # and both pickers are empty.
        if title or benchmark_options or language_options:
            filtered_df = _filter_leaderboard(
                df, title, language_options, benchmark_options
            )
            st.dataframe(filtered_df, use_container_width=True)

        # Multiselect for comparing models
        compare_models = st.multiselect(
            'Pick Models to compare them',
            df['Model Name'].dropna().unique(),
        )

        # Display the rows of the selected models, grouped in selection order.
        if compare_models:
            compare_df = pd.concat(
                [df[df['Model Name'] == model] for model in compare_models]
            )
            # 'Average' is recomputed over the currently selected benchmarks,
            # matching the main table.
            compare_df = _with_average(compare_df, benchmark_options)
            st.dataframe(compare_df, use_container_width=True)

    # About tab
    with about_tab:
        st.markdown('''
            ### About Indic LLM Leaderboard
            
            ### Indic Eval
            
            ### Contribute
        ''')

    # FAQ tab
    with faq_tab:
        st.markdown('''
            ### FAQ 
                        
            ### SUBMISSIONS
            
            
            ### RESULTS
            
            
            ### EDITING SUBMISSIONS
            
            
            ### OTHER
        ''')

    # Submit tab
    with submit_tab:
        st.markdown('''
            ### Submit Your Model
        ''')

    with st.expander(label="📙 Citation"):
        # "\\url" restores the BibTeX \url{...} command; a bare "\u" cannot be
        # written in a normal Python string, which is presumably why the
        # backslash had been dropped.
        code = '''
                    @misc{indic-llm-leaderboard,
            author = {Adithya S Kolavi},
            title = {Indic LLM Leaderboard},
            year = {2024},
            publisher = {Cognitivelab},
            howpublished = "\\url{https://huggingface.co/spaces/Cognitive-Lab/indic_llm_leaderboard}",
            }
        '''
        st.code(code, language='python')
        
# Build the page only when this file is executed as the main script, not
# when it is imported as a module.
if __name__ == "__main__":
    main()