Spaces: Running on CPU Upgrade
Commit · fe19b0b
Parent(s): da222cf
Enhance evaluation sections and streamline JSON result loading for retrieval and reranking
app.py
CHANGED
@@ -8,22 +8,66 @@ HEADER = """<div style="text-align: center; margin-bottom: 20px;">
 </div>
 """
 
-## About
-[… the rest of the old combined About section (old lines 11-26) is not legible in this view …]
+RETRIEVAL_ABOUT_SECTION = """
+## About Retrieval Evaluation
+
+The retrieval evaluation assesses a model's ability to find and retrieve relevant information from a large corpus of Arabic text. Models are evaluated on:
+
+### Web Search Dataset Metrics
+- **MRR (Mean Reciprocal Rank)**: Measures ranking quality by focusing on the position of the first relevant result
+- **nDCG (Normalized Discounted Cumulative Gain)**: Evaluates ranking quality, taking all relevant results into account
+- **Recall@5**: Measures the proportion of relevant documents found in the top 5 results
+- **Overall Score**: Combined score, calculated as the average of MRR, nDCG, and Recall@5
+
+### Model Requirements
+- Must support Arabic text embeddings
+- Should handle queries of at least 512 tokens
+- Must work with the `sentence-transformers` library
+
+### Evaluation Process
+1. Models process Arabic web search queries
+2. Retrieved documents are evaluated using:
+   - MRR for the position of the first relevant result
+   - nDCG for overall ranking quality
+   - Recall@5 for top-results accuracy
+3. Metrics are averaged to calculate the overall score
+4. Models are ranked by their overall performance
+
+### How to Prepare Your Model
+- Ensure your model is publicly available on the Hugging Face Hub (we don't support private model evaluations yet)
+- The model should output fixed-dimension embeddings for text
+- Support batch processing for efficient evaluation (this is the default if you use `sentence-transformers`)
+"""
+
+RERANKER_ABOUT_SECTION = """
+## About Reranking Evaluation
+
+The reranking evaluation assesses a model's ability to improve search quality by reordering initially retrieved results. Models are evaluated across multiple unseen Arabic datasets to ensure robust performance.
+
+### Evaluation Metrics
+- **MRR@10 (Mean Reciprocal Rank at 10)**: Measures ranking quality, focusing on the first relevant result in the top 10
+- **NDCG@10 (Normalized DCG at 10)**: Evaluates the ranking quality of all relevant results in the top 10
+- **MAP (Mean Average Precision)**: Measures overall precision across all relevant documents
+
+All metrics are averaged across the evaluation datasets to provide a comprehensive assessment of model performance.
+
+### Model Requirements
+- Must accept query-document pairs as input
+- Should output relevance scores for reranking (i.e., it has cross-attention or a similar mechanism for query-document matching)
+- Must support Arabic text processing
+
+### Evaluation Process
+1. Models are tested on multiple unseen Arabic datasets
+2. For each dataset:
+   - Initial candidate documents are provided
+   - The model reranks the candidates
+   - MRR@10, NDCG@10, and MAP are calculated
+3. Final scores are averaged across all datasets
+4. Models are ranked by overall performance
+
+### How to Prepare Your Model
+- The model should be public on the Hugging Face Hub (private models are not supported yet)
+- Make sure it works correctly with the `sentence-transformers` library
 """
 
 CITATION_BUTTON_LABEL = """
@@ -31,7 +75,7 @@ Copy the following snippet to cite these results
 """
 
 CITATION_BUTTON_TEXT = """
-@misc{
+@misc{TARL,
 author = {Mohaned A. Rashad, Hamza Shahid},
 title = {The Arabic RAG Leaderboard},
 year = {2025},
@@ -119,7 +163,7 @@ def main():
                 submit_gradio_module("Retriever")
 
             with gr.Tab("ℹ️ About"):
-                gr.Markdown(
+                gr.Markdown(RETRIEVAL_ABOUT_SECTION)
 
         with gr.Tab("📊 Reranking"):
             with gr.Tabs():
@@ -161,7 +205,7 @@ def main():
                 submit_gradio_module("Reranker")
 
             with gr.Tab("ℹ️ About"):
-                gr.Markdown(
+                gr.Markdown(RERANKER_ABOUT_SECTION)
 
         # with gr.Tab("🧠 LLM Context Answering"):
         #     with gr.Tabs():
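For the retrieval track's "How to Prepare Your Model" checklist, a quick self-check is simply loading the model through `sentence-transformers` and encoding a small Arabic batch. A sketch, assuming a hypothetical public Hub repo id:

```python
from sentence_transformers import SentenceTransformer

# Placeholder model id — use your own public Hugging Face Hub repo.
model = SentenceTransformer("your-org/arabic-embedder")

queries = ["ما هو الذكاء الاصطناعي؟", "أين تقع مكتبة الإسكندرية؟"]

# encode() batches internally, which covers the batch-processing requirement.
embeddings = model.encode(queries, batch_size=32)

# Fixed-dimension embeddings: one vector of equal length per input text.
assert embeddings.shape[0] == len(queries)
print(embeddings.shape)  # e.g. (2, 768), depending on the model
```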
utils.py
CHANGED
@@ -12,35 +12,40 @@ DATASET_REPO_ID = f"{OWNER}/requests-dataset"
 
 results_dir = Path(__file__).parent / "results"
 
-def load_retrieval_results(prepare_for_display=False):
-    # Load the retrieval results
-    dataframe_path = results_dir / "retrieval_results.json"
-    if dataframe_path.exists():
-        df = pd.read_json(dataframe_path)
-    else:
-        raise FileNotFoundError(f"File '{dataframe_path}' not found.")
+# Cache the HF token to avoid multiple os.environ lookups.
+HF_TOKEN = os.environ.get('HF_TOKEN', None)
 
+# Add a helper to load JSON results with optional formatting.
+def load_json_results(file_path: Path, prepare_for_display=False, sort_col=None, drop_cols=None):
+    if file_path.exists():
+        df = pd.read_json(file_path)
+    else:
+        raise FileNotFoundError(f"File '{file_path}' not found.")
     if prepare_for_display:
-        df[["Model"]] = df[["Model"]].map(lambda x: f'<a href="https://huggingface.co/{x}" target="_blank">{x}</a>')
-        df.drop(columns=["Revision", "Precision", "Task"], inplace=True)
-        df.sort_values("Web Search Dataset (Overall Score)", ascending=False, inplace=True)
+        # Apply common mapping for model link formatting.
+        df[["Model"]] = df[["Model"]].applymap(lambda x: f'<a href="https://huggingface.co/{x}" target="_blank">{x}</a>')
+        if drop_cols is not None:
+            df.drop(columns=drop_cols, inplace=True)
+        if sort_col is not None:
+            df.sort_values(sort_col, ascending=False, inplace=True)
     return df
 
+def load_retrieval_results(prepare_for_display=False):
+    dataframe_path = results_dir / "retrieval_results.json"
+    return load_json_results(
+        dataframe_path,
+        prepare_for_display=prepare_for_display,
+        sort_col="Web Search Dataset (Overall Score)",
+        drop_cols=["Revision", "Precision", "Task"]
+    )
+
 def load_reranking_results(prepare_for_display=False):
-    # Load the reranking results
     dataframe_path = results_dir / "reranking_results.json"
-    if dataframe_path.exists():
-        df = pd.read_json(dataframe_path)
-    else:
-        raise FileNotFoundError(f"File '{dataframe_path}' not found.")
-    if prepare_for_display:
-        df[["Model"]] = df[["Model"]].map(lambda x: f'<a href="https://huggingface.co/{x}" target="_blank">{x}</a>')
-        df.sort_values("Overall Score", ascending=False, inplace=True)
-
-    return df
-
+    return load_json_results(
+        dataframe_path,
+        prepare_for_display=prepare_for_display,
+        sort_col="Overall Score"
+    )
 
 def get_model_info(model_id, verbose=False):
     model_info = api.model_info(model_id)
@@ -148,13 +153,12 @@ def submit_model(model_name, revision, precision, params, license, task):
 
     # Upload the submission to the dataset repository
     try:
-        hf_api_token = os.environ.get('HF_TOKEN', None)
         api.upload_file(
             path_or_fileobj=submission_json.encode('utf-8'),
             path_in_repo=file_path_in_repo,
             repo_id=DATASET_REPO_ID,
             repo_type="dataset",
-            token=hf_api_token
+            token=HF_TOKEN
         )
     except Exception as e:
         print(f"Error uploading file: {e}")
@@ -167,14 +171,12 @@ def load_requests(status_folder):
     requests_data = []
     folder_path_in_repo = status_folder  # 'pending', 'finished', or 'failed'
 
-    hf_api_token = os.environ.get('HF_TOKEN', None)
-
     try:
-        # …
+        # Use the cached token
         files_info = api.list_repo_files(
             repo_id=DATASET_REPO_ID,
             repo_type="dataset",
-            token=hf_api_token
+            token=HF_TOKEN
         )
     except Exception as e:
        print(f"Error accessing dataset repository: {e}")
@@ -190,7 +192,7 @@ def load_requests(status_folder):
                 repo_id=DATASET_REPO_ID,
                 filename=file_path,
                 repo_type="dataset",
-                token=hf_api_token
+                token=HF_TOKEN
             )
             # Load JSON data
             with open(local_file_path, 'r') as f:
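On the consuming side, the refactor keeps both loaders' signatures unchanged, so callers in app.py need no changes. A plausible usage sketch (not taken from the repository; the keyword behavior mirrors the diff above):

```python
from utils import load_retrieval_results, load_reranking_results

# Raw frames, e.g. for further processing:
retrieval_df = load_retrieval_results()
reranking_df = load_reranking_results()

# Display-ready frames: the shared helper turns model ids into HTML links,
# drops the retrieval-only columns, and sorts by the task's score column.
retrieval_table = load_retrieval_results(prepare_for_display=True)
reranking_table = load_reranking_results(prepare_for_display=True)
```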
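The remaining hunks are one mechanical substitution: each per-call `hf_api_token = os.environ.get('HF_TOKEN', None)` lookup is replaced by the module-level `HF_TOKEN`, passed explicitly to every Hub call. The pattern in isolation (the repo id is a placeholder):

```python
import os
from huggingface_hub import HfApi

# Read once at import time; None falls back to anonymous access for public repos.
HF_TOKEN = os.environ.get('HF_TOKEN', None)
api = HfApi()

files = api.list_repo_files(
    repo_id="owner/requests-dataset",  # placeholder repo id
    repo_type="dataset",
    token=HF_TOKEN,
)
```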