# Source: Hugging Face Space "Cognitive-Lab/indic_llm_leaderboard" (status: Running).
# File size as listed on the Space page: 7,155 bytes.

import os
import re
import streamlit as st
import requests
import pandas as pd
from io import StringIO
import plotly.graph_objs as go
from huggingface_hub import HfApi
from huggingface_hub.utils import RepositoryNotFoundError, RevisionNotFoundError
from dotenv import load_dotenv

# Let python-dotenv populate the process environment first, so that the
# lookup below can see values that come from a .env file.
load_dotenv()

# Endpoint that get_data() queries; None when the variable is not defined.
SERVER_URL = os.environ.get("SERVER_URL")

def get_data():
    """Fetch the leaderboard payload from ``SERVER_URL`` and decode it as JSON.

    Returns:
        The decoded JSON body of the response. ``main`` iterates over it and
        reads ``name``, ``language`` and ``result`` from each element.

    Raises:
        ValueError: If ``SERVER_URL`` is not configured, or if the response
            body is not valid JSON.
        requests.exceptions.RequestException: On connection errors, timeouts,
            or an HTTP error status.
    """
    if not SERVER_URL:
        # Fail with an actionable message instead of requests' generic
        # "Invalid URL 'None'" error.
        raise ValueError(
            "SERVER_URL is not set; define it in the environment or in a .env file."
        )

    # Without a timeout a stalled server would block the page forever.
    response = requests.get(SERVER_URL, timeout=30)
    # Surface 4xx/5xx responses instead of handing an error body to the caller
    # as if it were leaderboard data.
    response.raise_for_status()
    return response.json()

# Leaderboard column for each benchmark -> key(s) it may be stored under in an
# item's "result" mapping. The column names double as the options of the
# "Pick Benchmark" widget, so a selected benchmark is always a real column.
# NOTE(review): the previous code read the misspelled key "Winograde"; both
# spellings are accepted because the server schema is not visible from here.
_BENCHMARK_RESULT_KEYS = {
    "ARC-Easy": ("ARC-Easy",),
    "ARC-Challenge": ("ARC-Challenge",),
    "Hellaswag": ("Hellaswag",),
    "Boolq": ("Boolq",),
    "MMLU": ("MMLU",),
    "Winogrande": ("Winogrande", "Winograde"),
    "Translation": ("Translation",),
    "Generation": ("Generation",),
}

# Benchmarks that are pre-selected when the page loads.
_DEFAULT_BENCHMARKS = ["ARC-Easy", "ARC-Challenge", "Hellaswag", "Boolq", "MMLU"]

# Language values offered by the filter; they are compared verbatim against the
# "language" field of each item, so the spellings must not be "corrected" here.
_LANGUAGES = ["kannada", "hindi", "tamil", "telegu", "gujarathi", "marathi", "malayalam"]

# Columns that identify a row and are always displayed.
_ID_COLUMNS = ["Model Name", "Language"]

# Column holding result["all"]["acc_norm"] as sent by the server. It is kept
# distinct from "Average", which is computed from the selected benchmarks.
_REPORTED_AVERAGE = "Reported Average"

# BibTeX entry shown in the citation expander.
_CITATION = "\n".join([
    "@misc{indic-llm-leaderboard,",
    "  author = {Adithya S Kolavi},",
    "  title = {Indic LLM Leaderboard},",
    "  year = {2024},",
    "  publisher = {Cognitivelab},",
    r'  howpublished = "\url{https://huggingface.co/spaces/Cognitive-Lab/indic_llm_leaderboard}",',
    "}",
])


def _acc_norm(result, keys):
    """Return the first ``acc_norm`` found in *result* under any of *keys*, else ``None``."""
    if not isinstance(result, dict):
        return None
    for key in keys:
        entry = result.get(key)
        if isinstance(entry, dict) and "acc_norm" in entry:
            return entry["acc_norm"]
    return None


def _build_leaderboard_frame(data):
    """Turn the server payload into a DataFrame with one row per payload item.

    The frame always has the full set of columns, even for an empty payload,
    so later column lookups cannot fail.
    """
    rows = []
    for item in data or []:
        if not isinstance(item, dict):
            continue  # ignore malformed entries rather than crash the page
        result = item.get("result")
        row = {
            "Model Name": item.get("name"),
            "Language": item.get("language"),
            _REPORTED_AVERAGE: _acc_norm(result, ("all",)),
        }
        for column, keys in _BENCHMARK_RESULT_KEYS.items():
            row[column] = _acc_norm(result, keys)
        rows.append(row)

    score_columns = [_REPORTED_AVERAGE] + list(_BENCHMARK_RESULT_KEYS)
    frame = pd.DataFrame(rows, columns=_ID_COLUMNS + score_columns)
    # Missing scores become NaN so that row means skip them consistently.
    for column in score_columns:
        frame[column] = pd.to_numeric(frame[column], errors="coerce")
    return frame


def _search_mask(names, search_text):
    """Return a boolean mask of *names* matching any ``;``-separated query.

    Each query is a case-insensitive literal substring (not a regex, so input
    such as ``(`` is safe). Blank queries are ignored; if none remain, every
    row matches.
    """
    queries = [part.strip() for part in search_text.split(";")]
    queries = [query for query in queries if query]
    if not queries:
        return pd.Series(True, index=names.index)

    mask = pd.Series(False, index=names.index)
    for query in queries:
        mask |= names.str.contains(query, case=False, na=False, regex=False)
    return mask


def _select_view(df, benchmarks, languages, search_text=""):
    """Return the rows/columns to display, plus an ``Average`` of *benchmarks*.

    Rows are restricted to *languages* and, when *search_text* is given, to
    model names matching it. Columns keep the order they have in *df*.
    """
    view = df[df["Language"].isin(languages)]
    if search_text:
        view = view[_search_mask(view["Model Name"], search_text)]

    wanted = set(_ID_COLUMNS) | set(benchmarks)
    # .copy() so that adding a column does not write into a slice of df.
    view = view[[column for column in df.columns if column in wanted]].copy()

    selected = [name for name in benchmarks if name in view.columns]
    view["Average"] = view[selected].mean(axis=1)
    return view


def main():
    """Render the Indic LLM Leaderboard Streamlit page.

    Builds the page header, the Leaderboard / About / FAQ / Submit tabs and the
    citation expander. The leaderboard tab fetches the results with
    ``get_data()``, lets the user filter them by model name, benchmark and
    language, and offers a side-by-side comparison of selected models.
    """
    st.set_page_config(page_title="Indic LLM Leaderboard", layout="wide")

    title_column, refresh_column = st.columns([.92, 0.08])
    with title_column:
        st.title("Indic LLM Leaderboard (α)")
        st.markdown("The Indic Eval Leaderboard utilizes the [indic_eval](https://github.com/adithya-s-k/indic_eval) evaluation framework , incorporating SOTA translated benchmarks like ARC, Hellaswag, MMLU, among others. Supporting 7 Indic languages, it offers a comprehensive platform for assessing model performance and comparing results within the Indic language modeling landscape.")
    with refresh_column:
        # Streamlit reruns the script on every widget interaction, so pressing
        # the button already triggers the fetch in the leaderboard tab below;
        # fetching here as well would only duplicate the request.
        st.button("Refresh", type="primary")

    leaderboard_tab, about_tab, faq_tab, submit_tab = st.tabs(["🏅 Leaderboard", "📝 About" , "❗FAQ","🚀 Submit"])

    with leaderboard_tab:
        df = _build_leaderboard_frame(get_data())

        search_text = st.text_input('Model Name', placeholder=" 🔍 Search for your model (separate multiple queries with `;`) and press ENTER...")

        col1, col2 = st.columns(2)
        with col1:
            benchmark_options = st.multiselect(
                'Pick Benchmark',
                list(_BENCHMARK_RESULT_KEYS),
                list(_DEFAULT_BENCHMARKS),
            )
        with col2:
            language_options = st.multiselect(
                'Pick Languages',
                list(_LANGUAGES),
                list(_LANGUAGES),
            )

        # The table is hidden only when there is no search text and both
        # selections are empty.
        if search_text or benchmark_options or language_options:
            st.dataframe(
                _select_view(df, benchmark_options, language_options, search_text),
                use_container_width=True,
            )

        # Multiselect for comparing models
        compare_models = st.multiselect(
            'Pick Models to compare them',
            df['Model Name'].dropna().unique()
        )

        # Display all rows of the selected models, in the order they were picked.
        if compare_models:
            compare_df = pd.concat([df[df['Model Name'] == model] for model in compare_models])
            compare_df['Average'] = compare_df[benchmark_options].mean(axis=1)
            st.dataframe(compare_df, use_container_width=True)

    # About tab
    with about_tab:
        st.markdown('''
            ### About Indic LLM Leaderboard

            ### Indic Eval

            ### Contribute
        ''')

    # FAQ tab
    with faq_tab:
        st.markdown('''
            ### FAQ 

            ### SUBMISSIONS


            ### RESULTS


            ### EDITING SUBMISSIONS


            ### OTHER
        ''')

    # Submit tab
    with submit_tab:
        st.markdown('''
            ### Submit Your Model
        ''')

    with st.expander(label="📙 Citation"):
        # The entry is BibTeX, not Python; "latex" is the closest highlighter.
        st.code(_CITATION, language='latex')
        
# Call main() only when this file is executed as the main module, not on import.
if __name__ == "__main__":
    main()