# src/about.py

from enum import Enum
from dataclasses import dataclass

# TaskInfo describes one evaluated task: the benchmark's internal name,
# the column header shown on the leaderboard, and the metric reported.
@dataclass
class TaskInfo:
    benchmark: str   # internal subject identifier, e.g. 'professional_law'
    col_name: str    # human-readable column header, e.g. 'Professional Law'
    metric: str      # reported metric, e.g. 'accuracy'

# Tasks enum listing the MMMLU subjects reported on the leaderboard
# (a subset of the full benchmark; some subjects are intentionally excluded)
class Tasks(Enum):
    Professional_Law = TaskInfo(benchmark='professional_law', col_name='Professional Law', metric='accuracy')
    Moral_Scenarios = TaskInfo(benchmark='moral_scenarios', col_name='Moral Scenarios', metric='accuracy')
    Miscellaneous = TaskInfo(benchmark='miscellaneous', col_name='Miscellaneous', metric='accuracy')
    High_School_Psychology = TaskInfo(benchmark='high_school_psychology', col_name='High School Psychology', metric='accuracy')
    High_School_Macroeconomics = TaskInfo(benchmark='high_school_macroeconomics', col_name='High School Macroeconomics', metric='accuracy')
    Elementary_Mathematics = TaskInfo(benchmark='elementary_mathematics', col_name='Elementary Mathematics', metric='accuracy')
    Prehistory = TaskInfo(benchmark='prehistory', col_name='Prehistory', metric='accuracy')
    Philosophy = TaskInfo(benchmark='philosophy', col_name='Philosophy', metric='accuracy')
    High_School_Biology = TaskInfo(benchmark='high_school_biology', col_name='High School Biology', metric='accuracy')
    Nutrition = TaskInfo(benchmark='nutrition', col_name='Nutrition', metric='accuracy')
    Professional_Accounting = TaskInfo(benchmark='professional_accounting', col_name='Professional Accounting', metric='accuracy')
    Professional_Medicine = TaskInfo(benchmark='professional_medicine', col_name='Professional Medicine', metric='accuracy')
    High_School_Mathematics = TaskInfo(benchmark='high_school_mathematics', col_name='High School Mathematics', metric='accuracy')
    Clinical_Knowledge = TaskInfo(benchmark='clinical_knowledge', col_name='Clinical Knowledge', metric='accuracy')
    Security_Studies = TaskInfo(benchmark='security_studies', col_name='Security Studies', metric='accuracy')
    High_School_Microeconomics = TaskInfo(benchmark='high_school_microeconomics', col_name='High School Microeconomics', metric='accuracy')
    High_School_World_History = TaskInfo(benchmark='high_school_world_history', col_name='High School World History', metric='accuracy')
    Conceptual_Physics = TaskInfo(benchmark='conceptual_physics', col_name='Conceptual Physics', metric='accuracy')
    Marketing = TaskInfo(benchmark='marketing', col_name='Marketing', metric='accuracy')
    High_School_Statistics = TaskInfo(benchmark='high_school_statistics', col_name='High School Statistics', metric='accuracy')
    High_School_US_History = TaskInfo(benchmark='high_school_us_history', col_name='High School US History', metric='accuracy')
    High_School_Chemistry = TaskInfo(benchmark='high_school_chemistry', col_name='High School Chemistry', metric='accuracy')
    Sociology = TaskInfo(benchmark='sociology', col_name='Sociology', metric='accuracy')
    High_School_Geography = TaskInfo(benchmark='high_school_geography', col_name='High School Geography', metric='accuracy')
    High_School_Government_and_Politics = TaskInfo(benchmark='high_school_government_and_politics', col_name='High School Government and Politics', metric='accuracy')
    College_Medicine = TaskInfo(benchmark='college_medicine', col_name='College Medicine', metric='accuracy')
    Virology = TaskInfo(benchmark='virology', col_name='Virology', metric='accuracy')
    High_School_European_History = TaskInfo(benchmark='high_school_european_history', col_name='High School European History', metric='accuracy')
    Logical_Fallacies = TaskInfo(benchmark='logical_fallacies', col_name='Logical Fallacies', metric='accuracy')
    Astronomy = TaskInfo(benchmark='astronomy', col_name='Astronomy', metric='accuracy')
    High_School_Physics = TaskInfo(benchmark='high_school_physics', col_name='High School Physics', metric='accuracy')
    Electrical_Engineering = TaskInfo(benchmark='electrical_engineering', col_name='Electrical Engineering', metric='accuracy')
    College_Biology = TaskInfo(benchmark='college_biology', col_name='College Biology', metric='accuracy')
    Anatomy = TaskInfo(benchmark='anatomy', col_name='Anatomy', metric='accuracy')
    Formal_Logic = TaskInfo(benchmark='formal_logic', col_name='Formal Logic', metric='accuracy')
    International_Law = TaskInfo(benchmark='international_law', col_name='International Law', metric='accuracy')
    Econometrics = TaskInfo(benchmark='econometrics', col_name='Econometrics', metric='accuracy')
    Machine_Learning = TaskInfo(benchmark='machine_learning', col_name='Machine Learning', metric='accuracy')
    Management = TaskInfo(benchmark='management', col_name='Management', metric='accuracy')
    College_Physics = TaskInfo(benchmark='college_physics', col_name='College Physics', metric='accuracy')
    US_Foreign_Policy = TaskInfo(benchmark='us_foreign_policy', col_name='US Foreign Policy', metric='accuracy')
    Business_Ethics = TaskInfo(benchmark='business_ethics', col_name='Business Ethics', metric='accuracy')
    College_Mathematics = TaskInfo(benchmark='college_mathematics', col_name='College Mathematics', metric='accuracy')
    College_Chemistry = TaskInfo(benchmark='college_chemistry', col_name='College Chemistry', metric='accuracy')
    College_Computer_Science = TaskInfo(benchmark='college_computer_science', col_name='College Computer Science', metric='accuracy')
    High_School_Computer_Science = TaskInfo(benchmark='high_school_computer_science', col_name='High School Computer Science', metric='accuracy')
    Computer_Security = TaskInfo(benchmark='computer_security', col_name='Computer Security', metric='accuracy')
    Global_Facts = TaskInfo(benchmark='global_facts', col_name='Global Facts', metric='accuracy')
    Medical_Genetics = TaskInfo(benchmark='medical_genetics', col_name='Medical Genetics', metric='accuracy')
    Abstract_Algebra = TaskInfo(benchmark='abstract_algebra', col_name='Abstract Algebra', metric='accuracy')
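
# A minimal helper sketch (an assumption; app.py may implement its own
# aggregation): average a result row's per-subject accuracies using the
# display columns declared in the Tasks enum.
def average_accuracy(results: dict) -> float:
    cols = [task.value.col_name for task in Tasks]
    scores = [results[c] for c in cols if c in results]
    return sum(scores) / len(scores) if scores else 0.0

# Usage: average_accuracy({'Professional Law': 55.2, 'Philosophy': 61.0}) -> 58.1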


# Constants consumed by app.py to render the leaderboard UI

TITLE = """
<div align="center">
    <a href="https://imgbb.com/">
        <img src="https://i.ibb.co/zHzhXqb/Smile-Group-Logo.png" alt="Smile-Group-Logo" border="0" width="800" height="auto">
    </a>
</div>
"""



INTRODUCTION_TEXT = """
<div style="background-color:#001f3f; padding: 20px; border-radius: 10px;">
    <h1 style="color:#ffffff; font-family: Arial, sans-serif; text-align: center;">
        Welcome to <span style="color:#f39c12;">BASMA</span>: Benchmark for Arabic System in Multitask Assessment
    </h1>
    <p style="color:#d4d4d4; font-family: 'Verdana', sans-serif; font-size: 18px; text-align: center;">
        This leaderboard tracks the performance of Arabic large language models on the
        <strong>newly released MMMLU dataset from OpenAI</strong> across a wide range of subjects.
    </p>
</div>
"""


LLM_BENCHMARKS_TEXT = """
## About BASMA

BASMA is built on the Multilingual Massive Multitask Language Understanding (MMMLU) benchmark, which is designed to evaluate Arabic models on a wide range of subjects.

## How to Interpret the Leaderboard

- **Model**: The name of the model evaluated.
- **Average ⬆️**: The average accuracy across all subjects.
- **Subject Columns**: The accuracy (%) for each individual subject.

## How to Submit Your Model

Go to the **Submit here!** tab and provide your model details to have it evaluated and appear on the leaderboard.
"""

EVALUATION_QUEUE_TEXT = """
Below are the lists of models that have been evaluated, are currently being evaluated, or are pending evaluation.
"""

CITATION_BUTTON_LABEL = "Citation"
CITATION_BUTTON_TEXT = """
If you use this leaderboard or the MMMLU dataset in your research, please cite:

@misc{BASMA,
  author = {Nacar, Omer},
  title = {BASMA: Benchmark for Arabic System in Multitask Assessment},
  year = {2024},
  publisher = {Omartificial-Intelligence-Space}
}
"""