rahmanidashti committed
Commit a77a138 · 1 Parent(s): 730a691

upgrade leaderboard

.gitignore CHANGED
@@ -11,3 +11,5 @@ eval-results/
11
  eval-queue-bk/
12
  eval-results-bk/
13
  logs/
 
 
 
11
  eval-queue-bk/
12
  eval-results-bk/
13
  logs/
14
+
15
+ *.DS_Store
README.md CHANGED
@@ -1,13 +1,14 @@
1
  ---
2
- title: Llm As A Rel
3
  emoji: 🥇
4
  colorFrom: green
5
  colorTo: indigo
6
  sdk: gradio
7
  app_file: app.py
8
  pinned: true
9
- license: apache-2.0
10
- short_description: LLMs as Automatic Relevance Judgment
 
11
  ---
12
 
13
  # Start the configuration
 
1
  ---
2
+ title: Stark Leaderboard
3
  emoji: 🥇
4
  colorFrom: green
5
  colorTo: indigo
6
  sdk: gradio
7
  app_file: app.py
8
  pinned: true
9
+ license: mit
10
+ short_description: leaderboard of Semi-structured Retrieval Benchmark (STaRK)
11
+ hf_oauth: true
12
  ---
13
 
14
  # Start the configuration
app.py CHANGED
@@ -1,204 +1,972 @@
1
  import gradio as gr
2
- from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
3
  import pandas as pd
4
- from apscheduler.schedulers.background import BackgroundScheduler
5
- from huggingface_hub import snapshot_download
6
-
7
- from src.about import (
8
- CITATION_BUTTON_LABEL,
9
- CITATION_BUTTON_TEXT,
10
- EVALUATION_QUEUE_TEXT,
11
- INTRODUCTION_TEXT,
12
- LLM_BENCHMARKS_TEXT,
13
- TITLE,
14
- )
15
- from src.display.css_html_js import custom_css
16
- from src.display.utils import (
17
- BENCHMARK_COLS,
18
- COLS,
19
- EVAL_COLS,
20
- EVAL_TYPES,
21
- AutoEvalColumn,
22
- ModelType,
23
- fields,
24
- WeightType,
25
- Precision
26
- )
27
- from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
28
- from src.populate import get_evaluation_queue_df, get_leaderboard_df
29
- from src.submission.submit import add_new_eval
30
-
31
-
32
- def restart_space():
33
- API.restart_space(repo_id=REPO_ID)
34
-
35
- ### Space initialisation
36
- try:
37
- print(EVAL_REQUESTS_PATH)
38
- snapshot_download(
39
- repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
40
- )
41
- except Exception:
42
- restart_space()
43
  try:
44
- print(EVAL_RESULTS_PATH)
45
- snapshot_download(
46
- repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
47
- )
48
- except Exception:
49
- restart_space()
50
-
51
-
52
- LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
53
-
54
- (
55
- finished_eval_queue_df,
56
- running_eval_queue_df,
57
- pending_eval_queue_df,
58
- ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
59
-
60
- def init_leaderboard(dataframe):
61
- if dataframe is None or dataframe.empty:
62
- raise ValueError("Leaderboard DataFrame is empty or None.")
63
- return Leaderboard(
64
- value=dataframe,
65
- datatype=[c.type for c in fields(AutoEvalColumn)],
66
- select_columns=SelectColumns(
67
- default_selection=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default],
68
- cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden],
69
- label="Select Columns to Display:",
70
- ),
71
- search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.license.name],
72
- hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
73
- filter_columns=[
74
- ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="Model types"),
75
- ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"),
76
- ColumnFilter(
77
- AutoEvalColumn.params.name,
78
- type="slider",
79
- min=0.01,
80
- max=150,
81
- label="Select the number of parameters (B)",
82
- ),
83
- ColumnFilter(
84
- AutoEvalColumn.still_on_hub.name, type="boolean", label="Deleted/incomplete", default=True
85
- ),
86
- ],
87
- bool_checkboxgroup_label="Hide models",
88
- interactive=False,
89
  )
90
91
 
92
- demo = gr.Blocks(css=custom_css)
93
- with demo:
94
- gr.HTML(TITLE)
95
- gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
96
-
97
- with gr.Tabs(elem_classes="tab-buttons") as tabs:
98
- with gr.TabItem("🏅 LLM Benchmark", elem_id="llm-benchmark-tab-table", id=0):
99
- leaderboard = init_leaderboard(LEADERBOARD_DF)
100
-
101
- with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
102
- gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
103
-
104
- with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=3):
105
- with gr.Column():
106
- with gr.Row():
107
- gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
108
-
109
- with gr.Column():
110
- with gr.Accordion(
111
- f"✅ Finished Evaluations ({len(finished_eval_queue_df)})",
112
- open=False,
113
- ):
114
- with gr.Row():
115
- finished_eval_table = gr.components.Dataframe(
116
- value=finished_eval_queue_df,
117
- headers=EVAL_COLS,
118
- datatype=EVAL_TYPES,
119
- row_count=5,
120
- )
121
- with gr.Accordion(
122
- f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})",
123
- open=False,
124
- ):
125
- with gr.Row():
126
- running_eval_table = gr.components.Dataframe(
127
- value=running_eval_queue_df,
128
- headers=EVAL_COLS,
129
- datatype=EVAL_TYPES,
130
- row_count=5,
131
- )
132
-
133
- with gr.Accordion(
134
- f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
135
- open=False,
136
- ):
137
- with gr.Row():
138
- pending_eval_table = gr.components.Dataframe(
139
- value=pending_eval_queue_df,
140
- headers=EVAL_COLS,
141
- datatype=EVAL_TYPES,
142
- row_count=5,
143
- )
144
- with gr.Row():
145
- gr.Markdown("# ✉️✨ Submit your model here!", elem_classes="markdown-text")
146
-
147
- with gr.Row():
148
- with gr.Column():
149
- model_name_textbox = gr.Textbox(label="Model name")
150
- revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
151
- model_type = gr.Dropdown(
152
- choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
153
- label="Model type",
154
- multiselect=False,
155
- value=None,
156
- interactive=True,
157
- )
158
-
159
- with gr.Column():
160
- precision = gr.Dropdown(
161
- choices=[i.value.name for i in Precision if i != Precision.Unknown],
162
- label="Precision",
163
- multiselect=False,
164
- value="float16",
165
- interactive=True,
166
- )
167
- weight_type = gr.Dropdown(
168
- choices=[i.value.name for i in WeightType],
169
- label="Weights type",
170
- multiselect=False,
171
- value="Original",
172
- interactive=True,
173
- )
174
- base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")
175
-
176
- submit_button = gr.Button("Submit Eval")
177
- submission_result = gr.Markdown()
178
- submit_button.click(
179
- add_new_eval,
180
- [
181
- model_name_textbox,
182
- base_model_name_textbox,
183
- revision_name_textbox,
184
- precision,
185
- weight_type,
186
- model_type,
187
- ],
188
- submission_result,
189
  )
190
191
  with gr.Row():
192
- with gr.Accordion("📙 Citation", open=False):
193
- citation_button = gr.Textbox(
194
- value=CITATION_BUTTON_TEXT,
195
- label=CITATION_BUTTON_LABEL,
196
- lines=20,
197
- elem_id="citation-button",
198
- show_copy_button=True,
199
  )
200
 
201
- scheduler = BackgroundScheduler()
202
- scheduler.add_job(restart_space, "interval", seconds=1800)
203
- scheduler.start()
204
- demo.queue(default_concurrency_limit=40).launch()
 
1
  import gradio as gr
 
2
  import pandas as pd
3
+ import numpy as np
4
+ import os
5
+ import re
6
+ from datetime import datetime
7
+ import json
8
+ import torch
9
+ from tqdm import tqdm
10
+ from concurrent.futures import ProcessPoolExecutor, as_completed
11
+ import smtplib
12
+ from email.mime.multipart import MIMEMultipart
13
+ from email.mime.text import MIMEText
14
+ from huggingface_hub import HfApi
15
+ import shutil
16
+ import tempfile
17
+
18
+ from stark_qa import load_qa
19
+ from stark_qa.evaluator import Evaluator
20
+
21
+ from utils.hub_storage import HubStorage
22
+ from utils.token_handler import TokenHandler
23
+
24
+ # Initialize storage once at startup
25
  try:
26
+ REPO_ID = "snap-stanford/stark-leaderboard" # Replace with your space name
27
+ hub_storage = HubStorage(REPO_ID)
28
+ except Exception as e:
29
+ raise RuntimeError(f"Failed to initialize HuggingFace Hub storage: {e}")
30
+
31
+
32
+ def process_single_instance(args):
33
+ idx, eval_csv, qa_dataset, evaluator, eval_metrics = args
34
+ query, query_id, answer_ids, meta_info = qa_dataset[idx]
35
+
36
+ try:
37
+ pred_rank = eval_csv[eval_csv['query_id'] == query_id]['pred_rank'].item()
38
+ except (IndexError, ValueError):
39
+ raise IndexError(f'Error when processing query_id={query_id}, please make sure the predicted results exist for this query.')
40
+ except Exception as e:
41
+ raise RuntimeError(f'Unexpected error occurred while fetching prediction rank for query_id={query_id}: {e}')
42
+
43
+ if isinstance(pred_rank, str):
44
+ try:
45
+ pred_rank = eval(pred_rank)
46
+ except SyntaxError as e:
47
+ raise ValueError(f'Failed to parse pred_rank as a list for query_id={query_id}: {e}')
48
+
49
+ if not isinstance(pred_rank, list):
50
+ raise TypeError(f'Error when processing query_id={query_id}, expected pred_rank to be a list but got {type(pred_rank)}.')
51
+
52
+ pred_dict = {pred_rank[i]: -i for i in range(min(100, len(pred_rank)))}
53
+ answer_ids = torch.LongTensor(answer_ids)
54
+ result = evaluator.evaluate(pred_dict, answer_ids, metrics=eval_metrics)
55
+
56
+ result["idx"], result["query_id"] = idx, query_id
57
+ return result
58
+
59
+
60
+ def compute_metrics(csv_path: str, dataset: str, split: str, num_workers: int = 4):
61
+ candidate_ids_dict = {
62
+ 'amazon': [i for i in range(957192)],
63
+ 'mag': [i for i in range(1172724, 1872968)],
64
+ 'prime': [i for i in range(129375)]
65
+ }
66
+ try:
67
+ eval_csv = pd.read_csv(csv_path)
68
+ if 'query_id' not in eval_csv.columns:
69
+ raise ValueError('No `query_id` column found in the submitted csv.')
70
+ if 'pred_rank' not in eval_csv.columns:
71
+ raise ValueError('No `pred_rank` column found in the submitted csv.')
72
+
73
+ eval_csv = eval_csv[['query_id', 'pred_rank']]
74
+
75
+ if dataset not in candidate_ids_dict:
76
+ raise ValueError(f"Invalid dataset '{dataset}', expected one of {list(candidate_ids_dict.keys())}.")
77
+ if split not in ['test', 'test-0.1', 'human_generated_eval']:
78
+ raise ValueError(f"Invalid split '{split}', expected one of ['test', 'test-0.1', 'human_generated_eval'].")
79
+
80
+ evaluator = Evaluator(candidate_ids_dict[dataset])
81
+ eval_metrics = ['hit@1', 'hit@5', 'recall@20', 'mrr']
82
+ qa_dataset = load_qa(dataset, human_generated_eval=split == 'human_generated_eval')
83
+ split_idx = qa_dataset.get_idx_split()
84
+ all_indices = split_idx[split].tolist()
85
+
86
+ results_list = []
87
+ query_ids = []
88
+
89
+ # Prepare args for each worker
90
+ args = [(idx, eval_csv, qa_dataset, evaluator, eval_metrics) for idx in all_indices]
91
+
92
+ with ProcessPoolExecutor(max_workers=num_workers) as executor:
93
+ futures = [executor.submit(process_single_instance, arg) for arg in args]
94
+ for future in tqdm(as_completed(futures), total=len(futures)):
95
+ result = future.result() # This will raise an error if the worker encountered one
96
+ results_list.append(result)
97
+ query_ids.append(result['query_id'])
98
+
99
+ # Concatenate results and compute final metrics
100
+ eval_csv = pd.concat([eval_csv, pd.DataFrame(results_list)], ignore_index=True)
101
+ final_results = {
102
+ metric: np.mean(eval_csv[eval_csv['query_id'].isin(query_ids)][metric]) for metric in eval_metrics
103
+ }
104
+ return final_results
105
+
106
+ except pd.errors.EmptyDataError:
107
+ return "Error: The CSV file is empty or could not be read. Please check the file and try again."
108
+ except FileNotFoundError:
109
+ return f"Error: The file {csv_path} could not be found. Please check the file path and try again."
110
+ except Exception as error:
111
+ return f"{error}"
112
+
113
+
114
+ # Data dictionaries for leaderboard
115
+ data_synthesized_full = {
116
+ 'Method': ['BM25', 'DPR (roberta)', 'ANCE (roberta)', 'QAGNN (roberta)', 'ada-002', 'voyage-l2-instruct', 'LLM2Vec', 'GritLM-7b', 'multi-ada-002', 'ColBERTv2'],
117
+ 'STARK-AMAZON_Hit@1': [44.94, 15.29, 30.96, 26.56, 39.16, 40.93, 21.74, 42.08, 40.07, 46.10],
118
+ 'STARK-AMAZON_Hit@5': [67.42, 47.93, 51.06, 50.01, 62.73, 64.37, 41.65, 66.87, 64.98, 66.02],
119
+ 'STARK-AMAZON_R@20': [53.77, 44.49, 41.95, 52.05, 53.29, 54.28, 33.22, 56.52, 55.12, 53.44],
120
+ 'STARK-AMAZON_MRR': [55.30, 30.20, 40.66, 37.75, 50.35, 51.60, 31.47, 53.46, 51.55, 55.51],
121
+ 'STARK-MAG_Hit@1': [25.85, 10.51, 21.96, 12.88, 29.08, 30.06, 18.01, 37.90, 25.92, 31.18],
122
+ 'STARK-MAG_Hit@5': [45.25, 35.23, 36.50, 39.01, 49.61, 50.58, 34.85, 56.74, 50.43, 46.42],
123
+ 'STARK-MAG_R@20': [45.69, 42.11, 35.32, 46.97, 48.36, 50.49, 35.46, 46.40, 50.80, 43.94],
124
+ 'STARK-MAG_MRR': [34.91, 21.34, 29.14, 29.12, 38.62, 39.66, 26.10, 47.25, 36.94, 38.39],
125
+ 'STARK-PRIME_Hit@1': [12.75, 4.46, 6.53, 8.85, 12.63, 10.85, 10.10, 15.57, 15.10, 11.75],
126
+ 'STARK-PRIME_Hit@5': [27.92, 21.85, 15.67, 21.35, 31.49, 30.23, 22.49, 33.42, 33.56, 23.85],
127
+ 'STARK-PRIME_R@20': [31.25, 30.13, 16.52, 29.63, 36.00, 37.83, 26.34, 39.09, 38.05, 25.04],
128
+ 'STARK-PRIME_MRR': [19.84, 12.38, 11.05, 14.73, 21.41, 19.99, 16.12, 24.11, 23.49, 17.39]
129
+ }
130
+
131
+ data_synthesized_10 = {
132
+ 'Method': ['BM25', 'DPR (roberta)', 'ANCE (roberta)', 'QAGNN (roberta)', 'ada-002', 'voyage-l2-instruct', 'LLM2Vec', 'GritLM-7b', 'multi-ada-002', 'ColBERTv2', 'Claude3 Reranker', 'GPT4 Reranker'],
133
+ 'STARK-AMAZON_Hit@1': [42.68, 16.46, 30.09, 25.00, 39.02, 43.29, 18.90, 43.29, 40.85, 44.31, 45.49, 44.79],
134
+ 'STARK-AMAZON_Hit@5': [67.07, 50.00, 49.27, 48.17, 64.02, 67.68, 37.80, 71.34, 62.80, 65.24, 71.13, 71.17],
135
+ 'STARK-AMAZON_R@20': [54.48, 42.15, 41.91, 51.65, 49.30, 56.04, 34.73, 56.14, 52.47, 51.00, 53.77, 55.35],
136
+ 'STARK-AMAZON_MRR': [54.02, 30.20, 39.30, 36.87, 50.32, 54.20, 28.76, 55.07, 51.54, 55.07, 55.91, 55.69],
137
+ 'STARK-MAG_Hit@1': [27.81, 11.65, 22.89, 12.03, 28.20, 34.59, 19.17, 38.35, 25.56, 31.58, 36.54, 40.90],
138
+ 'STARK-MAG_Hit@5': [45.48, 36.84, 37.26, 37.97, 52.63, 50.75, 33.46, 58.64, 50.37, 47.36, 53.17, 58.18],
139
+ 'STARK-MAG_R@20': [44.59, 42.30, 44.16, 47.98, 49.25, 50.75, 29.85, 46.38, 53.03, 45.72, 48.36, 48.60],
140
+ 'STARK-MAG_MRR': [35.97, 21.82, 30.00, 28.70, 38.55, 42.90, 26.06, 48.25, 36.82, 38.98, 44.15, 49.00],
141
+ 'STARK-PRIME_Hit@1': [13.93, 5.00, 6.78, 7.14, 15.36, 12.14, 9.29, 16.79, 15.36, 15.00, 17.79, 18.28],
142
+ 'STARK-PRIME_Hit@5': [31.07, 23.57, 16.15, 17.14, 31.07, 31.42, 20.7, 34.29, 32.86, 26.07, 36.90, 37.28],
143
+ 'STARK-PRIME_R@20': [32.84, 30.50, 17.07, 32.95, 37.88, 37.34, 25.54, 41.11, 40.99, 27.78, 35.57, 34.05],
144
+ 'STARK-PRIME_MRR': [21.68, 13.50, 11.42, 16.27, 23.50, 21.23, 15.00, 24.99, 23.70, 19.98, 26.27, 26.55]
145
+ }
146
+
147
+ data_human_generated = {
148
+ 'Method': ['BM25', 'DPR (roberta)', 'ANCE (roberta)', 'QAGNN (roberta)', 'ada-002', 'voyage-l2-instruct', 'LLM2Vec', 'GritLM-7b', 'multi-ada-002', 'ColBERTv2', 'Claude3 Reranker', 'GPT4 Reranker'],
149
+ 'STARK-AMAZON_Hit@1': [27.16, 16.05, 25.93, 22.22, 39.50, 35.80, 29.63, 40.74, 46.91, 33.33, 53.09, 50.62],
150
+ 'STARK-AMAZON_Hit@5': [51.85, 39.51, 54.32, 49.38, 64.19, 62.96, 46.91, 71.60, 72.84, 55.56, 74.07, 75.31],
151
+ 'STARK-AMAZON_R@20': [29.23, 15.23, 23.69, 21.54, 35.46, 33.01, 21.21, 36.30, 40.22, 29.03, 35.46, 35.46],
152
+ 'STARK-AMAZON_MRR': [18.79, 27.21, 37.12, 31.33, 52.65, 47.84, 38.61, 53.21, 58.74, 43.77, 62.11, 61.06],
153
+ 'STARK-MAG_Hit@1': [32.14, 4.72, 25.00, 20.24, 28.57, 22.62, 16.67, 34.52, 23.81, 33.33, 38.10, 36.90],
154
+ 'STARK-MAG_Hit@5': [41.67, 9.52, 30.95, 26.19, 41.67, 36.90, 28.57, 44.04, 41.67, 36.90, 45.24, 46.43],
155
+ 'STARK-MAG_R@20': [32.46, 25.00, 27.24, 28.76, 35.95, 32.44, 21.74, 34.57, 39.85, 30.50, 35.95, 35.95],
156
+ 'STARK-MAG_MRR': [37.42, 7.90, 27.98, 25.53, 35.81, 29.68, 21.59, 38.72, 31.43, 35.97, 42.00, 40.65],
157
+ 'STARK-PRIME_Hit@1': [22.45, 2.04, 7.14, 6.12, 17.35, 16.33, 9.18, 25.51, 24.49, 15.31, 28.57, 28.57],
158
+ 'STARK-PRIME_Hit@5': [41.84, 9.18, 13.27, 13.27, 34.69, 32.65, 21.43, 41.84, 39.80, 26.53, 46.94, 44.90],
159
+ 'STARK-PRIME_R@20': [42.32, 10.69, 11.72, 17.62, 41.09, 39.01, 26.77, 48.10, 47.21, 25.56, 41.61, 41.61],
160
+ 'STARK-PRIME_MRR': [30.37, 7.05, 10.07, 9.39, 26.35, 24.33, 15.24, 34.28, 32.98, 19.67, 36.32, 34.82]
161
+ }
162
+
163
+ # Initialize DataFrames
164
+ df_synthesized_full = pd.DataFrame(data_synthesized_full)
165
+ df_synthesized_10 = pd.DataFrame(data_synthesized_10)
166
+ df_human_generated = pd.DataFrame(data_human_generated)
167
+
168
+ # Model type definitions
169
+ model_types = {
170
+ 'Sparse Retriever': ['BM25'],
171
+ 'Small Dense Retrievers': ['DPR (roberta)', 'ANCE (roberta)', 'QAGNN (roberta)'],
172
+ 'LLM-based Dense Retrievers': ['ada-002', 'voyage-l2-instruct', 'LLM2Vec', 'GritLM-7b'],
173
+ 'Multivector Retrievers': ['multi-ada-002', 'ColBERTv2'],
174
+ 'LLM Rerankers': ['Claude3 Reranker', 'GPT4 Reranker'],
175
+ 'Others': [] # Will be populated dynamically with submitted models
176
+ }
177
+
178
+ # Submission form validation functions
179
+ def validate_email(email_str):
180
+ """Validate email format(s)"""
181
+ emails = [e.strip() for e in email_str.split(';')]
182
+ email_pattern = re.compile(r'^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$')
183
+ return all(email_pattern.match(email) for email in emails)
184
+
185
+ def validate_github_url(url):
186
+ """Validate GitHub URL format"""
187
+ github_pattern = re.compile(
188
+ r'^https?:\/\/(?:www\.)?github\.com\/[\w-]+\/[\w.-]+\/?$'
189
  )
190
+ return bool(github_pattern.match(url))
191
+
192
+ def validate_csv(file_obj):
193
+ """Validate CSV file format and content"""
194
+ try:
195
+ df = pd.read_csv(file_obj.name)
196
+ required_cols = ['query_id', 'pred_rank']
197
+
198
+ if not all(col in df.columns for col in required_cols):
199
+ return False, "CSV must contain 'query_id' and 'pred_rank' columns"
200
+
201
+ try:
202
+ first_rank = eval(df['pred_rank'].iloc[0]) if isinstance(df['pred_rank'].iloc[0], str) else df['pred_rank'].iloc[0]
203
+ if not isinstance(first_rank, list) or len(first_rank) < 20:
204
+ return False, "pred_rank must be a list with at least 20 candidates"
205
+ except:
206
+ return False, "Invalid pred_rank format"
207
+
208
+ return True, "Valid CSV file"
209
+ except Exception as e:
210
+ return False, f"Error processing CSV: {str(e)}"
211
 
212
+ def sanitize_name(name):
213
+ """Sanitize name for file system use"""
214
+ return re.sub(r'[^a-zA-Z0-9]', '_', name)
215
 
216
+ def read_json_from_hub(api: HfApi, repo_id: str, file_path: str) -> dict:
217
+ """
218
+ Read and parse JSON file from HuggingFace Hub.
219
+
220
+ Args:
221
+ api: HuggingFace API instance
222
+ repo_id: Repository ID
223
+ file_path: Path to file in repository
224
+
225
+ Returns:
226
+ dict: Parsed JSON content
227
+ """
228
+ try:
229
+ # Download the file content as bytes
230
+ content = api.hf_hub_download(
231
+ repo_id=repo_id,
232
+ filename=file_path,
233
+ repo_type="space"
234
+ )
235
+
236
+ # Read and parse JSON
237
+ with open(content, 'r') as f:
238
+ return json.load(f)
239
+ except Exception as e:
240
+ print(f"Error reading JSON file {file_path}: {str(e)}")
241
+ return None
242
+
243
+ def scan_submissions_directory():
244
+ """
245
+ Scans the submissions directory and updates the model types dictionary
246
+ with submitted models.
247
+ """
248
+ try:
249
+ # Initialize HuggingFace API
250
+ api = HfApi()
251
+
252
+ # Track submissions for each split
253
+ submissions_by_split = {
254
+ 'test': [],
255
+ 'test-0.1': [],
256
+ 'human_generated_eval': []
257
+ }
258
+
259
+ # Get all files from repository
260
+ try:
261
+ all_files = api.list_repo_files(
262
+ repo_id=REPO_ID,
263
+ repo_type="space"
264
  )
265
+ # Filter for files in submissions directory
266
+ repo_files = [f for f in all_files if f.startswith('submissions/')]
267
+ except Exception as e:
268
+ print(f"Error listing repository contents: {str(e)}")
269
+ return submissions_by_split
270
+
271
+ # Group files by team folders
272
+ folder_files = {}
273
+ for filepath in repo_files:
274
+ parts = filepath.split('/')
275
+ if len(parts) < 3: # Need at least submissions/team_folder/file
276
+ continue
277
+
278
+ folder_name = parts[1] # team_folder name
279
+ if folder_name not in folder_files:
280
+ folder_files[folder_name] = []
281
+ folder_files[folder_name].append(filepath)
282
+
283
+ # Process each team folder
284
+ for folder_name, files in folder_files.items():
285
+ try:
286
+ # Find latest.json in this folder
287
+ latest_file = next((f for f in files if f.endswith('latest.json')), None)
288
+ if not latest_file:
289
+ print(f"No latest.json found in {folder_name}")
290
+ continue
291
+
292
+ # Read latest.json
293
+ latest_info = read_json_from_hub(api, REPO_ID, latest_file)
294
+ if not latest_info:
295
+ print(f"Failed to read latest.json for {folder_name}")
296
+ continue
297
+
298
+ timestamp = latest_info.get('latest_submission')
299
+ if not timestamp:
300
+ print(f"No timestamp found in latest.json for {folder_name}")
301
+ continue
302
+
303
+ # Find metadata file for latest submission
304
+ metadata_file = next(
305
+ (f for f in files if f.endswith(f'metadata_{timestamp}.json')),
306
+ None
307
+ )
308
+ if not metadata_file:
309
+ print(f"No matching metadata file found for {folder_name} timestamp {timestamp}")
310
+ continue
311
+
312
+ # Read metadata file
313
+ submission_data = read_json_from_hub(api, REPO_ID, metadata_file)
314
+ if not submission_data:
315
+ print(f"Failed to read metadata for {folder_name}")
316
+ continue
317
+
318
+ if latest_info.get('status') != 'approved':
319
+ print(f"Skipping unapproved submission in {folder_name}")
320
+ continue
321
+
322
+ # Add to submissions by split
323
+ split = submission_data.get('Split')
324
+ if split in submissions_by_split:
325
+ submissions_by_split[split].append(submission_data)
326
+
327
+ # Update model types if necessary
328
+ method_name = submission_data.get('Method Name')
329
+ model_type = submission_data.get('Model Type', 'Others')
330
+
331
+ # Add to model type if it's a new method
332
+ method_exists = any(method_name in methods for methods in model_types.values())
333
+ if not method_exists and model_type in model_types:
334
+ model_types[model_type].append(method_name)
335
+
336
+ except Exception as e:
337
+ print(f"Error processing folder {folder_name}: {str(e)}")
338
+ continue
339
+
340
+ return submissions_by_split
341
+
342
+ except Exception as e:
343
+ print(f"Error scanning submissions directory: {str(e)}")
344
+ return None
345
+
346
+ def initialize_leaderboard():
347
+ """
348
+ Initialize the leaderboard with baseline results and submitted results.
349
+ """
350
+ global df_synthesized_full, df_synthesized_10, df_human_generated
351
+
352
+ try:
353
+ # First, initialize with baseline results
354
+ df_synthesized_full = pd.DataFrame(data_synthesized_full)
355
+ df_synthesized_10 = pd.DataFrame(data_synthesized_10)
356
+ df_human_generated = pd.DataFrame(data_human_generated)
357
+
358
+ print("Initialized with baseline results")
359
+
360
+ # Then scan and add submitted results
361
+ submissions = scan_submissions_directory()
362
+ if submissions:
363
+ for split, split_submissions in submissions.items():
364
+ for submission in split_submissions:
365
+ if submission.get('results'): # Make sure we have results
366
+ # Update appropriate DataFrame based on split
367
+ if split == 'test':
368
+ df_to_update = df_synthesized_full
369
+ elif split == 'test-0.1':
370
+ df_to_update = df_synthesized_10
371
+ else: # human_generated_eval
372
+ df_to_update = df_human_generated
373
+
374
+ # Prepare new row data
375
+ new_row = {
376
+ 'Method': submission['Method Name'],
377
+ f'STARK-{submission["Dataset"].upper()}_Hit@1': submission['results']['hit@1'],
378
+ f'STARK-{submission["Dataset"].upper()}_Hit@5': submission['results']['hit@5'],
379
+ f'STARK-{submission["Dataset"].upper()}_R@20': submission['results']['recall@20'],
380
+ f'STARK-{submission["Dataset"].upper()}_MRR': submission['results']['mrr']
381
+ }
382
+
383
+ # Update existing row or add new one
384
+ method_mask = df_to_update['Method'] == submission['Method Name']
385
+ if method_mask.any():
386
+ for col in new_row:
387
+ df_to_update.loc[method_mask, col] = new_row[col]
388
+ else:
389
+ df_to_update.loc[len(df_to_update)] = new_row
390
+
391
+ print("Leaderboard initialization complete")
392
+
393
+ except Exception as e:
394
+ print(f"Error initializing leaderboard: {str(e)}")
395
+
396
+ def get_file_content(file_path):
397
+ """
398
+ Helper function to safely read file content from HuggingFace repository
399
+ """
400
+ try:
401
+ api = HfApi()
402
+ content_path = api.hf_hub_download(
403
+ repo_id=REPO_ID,
404
+ filename=file_path,
405
+ repo_type="space"
406
+ )
407
+ with open(content_path, 'r') as f:
408
+ return f.read()
409
+ except Exception as e:
410
+ print(f"Error reading file {file_path}: {str(e)}")
411
+ return None
412
+
413
+ def save_submission(submission_data, csv_file):
414
+ """
415
+ Save submission data and CSV file using model_name_team_name format
416
+
417
+ Args:
418
+ submission_data (dict): Metadata and results for the submission
419
+ csv_file: The uploaded CSV file object
420
+ """
421
+ # Create folder name from model name and team name
422
+ model_name_clean = sanitize_name(submission_data['Method Name'])
423
+ team_name_clean = sanitize_name(submission_data['Team Name'])
424
+ timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
425
+
426
+ # Create folder name: model_name_team_name
427
+ folder_name = f"{model_name_clean}_{team_name_clean}"
428
+ submission_id = f"{folder_name}_{timestamp}"
429
+
430
+ # Create submission directory structure
431
+ base_dir = "submissions"
432
+ submission_dir = os.path.join(base_dir, folder_name)
433
+ os.makedirs(submission_dir, exist_ok=True)
434
+
435
+ # Save CSV file with timestamp to allow multiple submissions
436
+ csv_filename = f"predictions_{timestamp}.csv"
437
+ csv_path = os.path.join(submission_dir, csv_filename)
438
+ if hasattr(csv_file, 'name'):
439
+ with open(csv_file.name, 'rb') as source, open(csv_path, 'wb') as target:
440
+ target.write(source.read())
441
+
442
+ # Add file paths to submission data
443
+ submission_data.update({
444
+ "csv_path": csv_path,
445
+ "submission_id": submission_id,
446
+ "folder_name": folder_name
447
+ })
448
+
449
+ # Save metadata as JSON with timestamp
450
+ metadata_path = os.path.join(submission_dir, f"metadata_{timestamp}.json")
451
+ with open(metadata_path, 'w') as f:
452
+ json.dump(submission_data, f, indent=4)
453
+
454
+ # Update latest.json to track most recent submission
455
+ latest_path = os.path.join(submission_dir, "latest.json")
456
+ with open(latest_path, 'w') as f:
457
+ json.dump({
458
+ "latest_submission": timestamp,
459
+ "status": "pending_review",
460
+ "method_name": submission_data['Method Name']
461
+ }, f, indent=4)
462
+
463
+ return submission_id
464
+
465
+ def update_leaderboard_data(submission_data):
466
+ """
467
+ Update leaderboard data with new submission results
468
+ Only uses model name in the displayed table
469
+ """
470
+ global df_synthesized_full, df_synthesized_10, df_human_generated
471
+
472
+ # Determine which DataFrame to update based on split
473
+ split_to_df = {
474
+ 'test': df_synthesized_full,
475
+ 'test-0.1': df_synthesized_10,
476
+ 'human_generated_eval': df_human_generated
477
+ }
478
+
479
+ df_to_update = split_to_df[submission_data['Split']]
480
+
481
+ # Prepare new row data
482
+ new_row = {
483
+ 'Method': submission_data['Method Name'], # Only use method name in table
484
+ f'STARK-{submission_data["Dataset"].upper()}_Hit@1': submission_data['results']['hit@1'],
485
+ f'STARK-{submission_data["Dataset"].upper()}_Hit@5': submission_data['results']['hit@5'],
486
+ f'STARK-{submission_data["Dataset"].upper()}_R@20': submission_data['results']['recall@20'],
487
+ f'STARK-{submission_data["Dataset"].upper()}_MRR': submission_data['results']['mrr']
488
+ }
489
+
490
+ # Check if method already exists
491
+ method_mask = df_to_update['Method'] == submission_data['Method Name']
492
+ if method_mask.any():
493
+ # Update existing row
494
+ for col in new_row:
495
+ df_to_update.loc[method_mask, col] = new_row[col]
496
+ else:
497
+ # Add new row
498
+ df_to_update.loc[len(df_to_update)] = new_row
499
+
500
+ # Function to get emails from meta_data
501
+ def get_emails_from_metadata(meta_data):
502
+ """
503
+ Extracts emails from the meta_data dictionary.
504
+
505
+ Args:
506
+ meta_data (dict): The metadata dictionary that contains the 'Contact Email(s)' field.
507
+
508
+ Returns:
509
+ list: A list of email addresses.
510
+ """
511
+ return [email.strip() for email in meta_data.get("Contact Email(s)", "").split(";")]
512
+
513
+ # Function to format meta_data as an HTML table (without Prediction CSV)
514
+ def format_metadata_as_table(meta_data):
515
+ """
516
+ Formats metadata dictionary into an HTML table for the email.
517
+ Handles multiple contact emails separated by a semicolon.
518
+
519
+ Args:
520
+ meta_data (dict): Dictionary containing submission metadata.
521
+
522
+ Returns:
523
+ str: HTML string representing the metadata table.
524
+ """
525
+ table_rows = ""
526
+
527
+ for key, value in meta_data.items():
528
+ if key == "Contact Email(s)":
529
+ # Ensure that contact emails are split by semicolon
530
+ emails = value.split(';')
531
+ formatted_emails = "; ".join([email.strip() for email in emails])
532
+ table_rows += f"<tr><td><b>{key}</b></td><td>{formatted_emails}</td></tr>"
533
+ elif key != "Prediction CSV": # Exclude the Prediction CSV field
534
+ table_rows += f"<tr><td><b>{key}</b></td><td>{value}</td></tr>"
535
+
536
+ table_html = f"""
537
+ <table border="1" cellpadding="5" cellspacing="0">
538
+ {table_rows}
539
+ </table>
540
+ """
541
+ return table_html
542
 
543
+ # Function to get emails from meta_data
544
+ def get_emails_from_metadata(meta_data):
545
+ """
546
+ Extracts emails from the meta_data dictionary.
547
+
548
+ Args:
549
+ meta_data (dict): The metadata dictionary that contains the 'Contact Email(s)' field.
550
+
551
+ Returns:
552
+ list: A list of email addresses.
553
+ """
554
+ return [email.strip() for email in meta_data.get("Contact Email(s)", "").split(";")]
555
+
556
+ def format_evaluation_results(results):
557
+ """
558
+ Formats the evaluation results dictionary into a readable string.
559
+
560
+ Args:
561
+ results (dict): Dictionary containing evaluation metrics and their values.
562
+
563
+ Returns:
564
+ str: Formatted string of evaluation results.
565
+ """
566
+ result_lines = [f"{metric}: {value}" for metric, value in results.items()]
567
+ return "\n".join(result_lines)
568
+
569
+ def get_model_type_for_method(method_name):
570
+ """
571
+ Find the model type category for a given method name.
572
+ Returns 'Others' if not found in predefined categories.
573
+ """
574
+ for type_name, methods in model_types.items():
575
+ if method_name in methods:
576
+ return type_name
577
+ return 'Others'
578
+
579
+ def validate_model_type(method_name, selected_type):
580
+ """
581
+ Validate if the selected model type is appropriate for the method name.
582
+ Returns (is_valid, message).
583
+ """
584
+ # Check if method exists in any category
585
+ existing_type = None
586
+ for type_name, methods in model_types.items():
587
+ if method_name in methods:
588
+ existing_type = type_name
589
+ break
590
+
591
+ # If method exists, it must be submitted under its predefined category
592
+ if existing_type:
593
+ if existing_type != selected_type:
594
+ return False, f"This method name is already registered under '{existing_type}'. Please use the correct category."
595
+ return True, "Valid model type"
596
+
597
+ # For new methods, any category is valid
598
+ return True, "Valid model type"
599
+
600
+ def process_submission(
601
+ method_name, team_name, dataset, split, contact_email,
602
+ code_repo, csv_file, model_description, hardware, paper_link, model_type
603
+ ):
604
+ """Process and validate submission"""
605
+ temp_files = []
606
+ try:
607
+ # Input validation
608
+ if not all([method_name, team_name, dataset, split, contact_email, code_repo, csv_file, model_type]):
609
+ return "Error: Please fill in all required fields"
610
+
611
+ # Validate model type
612
+ is_valid, message = validate_model_type(method_name, model_type)
613
+ if not is_valid:
614
+ return f"Error: {message}"
615
+
616
+ # Create metadata
617
+ meta_data = {
618
+ "Method Name": method_name,
619
+ "Team Name": team_name,
620
+ "Dataset": dataset,
621
+ "Split": split,
622
+ "Contact Email(s)": contact_email,
623
+ "Code Repository": code_repo,
624
+ "Model Description": model_description,
625
+ "Hardware": hardware,
626
+ "(Optional) Paper link": paper_link,
627
+ "Model Type": model_type
628
+ }
629
+
630
+ # Generate folder name and timestamp
631
+ timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
632
+ folder_name = f"{sanitize_name(method_name)}_{sanitize_name(team_name)}"
633
+
634
+ # Process CSV file
635
+ temp_csv_path = None
636
+ if isinstance(csv_file, str):
637
+ temp_csv_path = csv_file
638
+ else:
639
+ temp_fd, temp_csv_path = tempfile.mkstemp(suffix='.csv')
640
+ temp_files.append(temp_csv_path)
641
+ os.close(temp_fd)
642
+
643
+ if hasattr(csv_file, 'name'):
644
+ shutil.copy2(csv_file.name, temp_csv_path)
645
+ else:
646
+ with open(temp_csv_path, 'wb') as temp_file:
647
+ if hasattr(csv_file, 'seek'):
648
+ csv_file.seek(0)
649
+ if hasattr(csv_file, 'read'):
650
+ shutil.copyfileobj(csv_file, temp_file)
651
+ else:
652
+ temp_file.write(csv_file)
653
+
654
+ if not os.path.exists(temp_csv_path):
655
+ raise FileNotFoundError(f"Failed to create temporary CSV file at {temp_csv_path}")
656
+
657
+ # Compute metrics
658
+ results = compute_metrics(
659
+ csv_path=temp_csv_path,
660
+ dataset=dataset.lower(),
661
+ split=split,
662
+ num_workers=4
663
+ )
664
+
665
+ if isinstance(results, str):
666
+ # send_error_notification(meta_data, results)
667
+ return f"Evaluation error: {results}"
668
+
669
+ # Process results
670
+ processed_results = {
671
+ "hit@1": round(results['hit@1'] * 100, 2),
672
+ "hit@5": round(results['hit@5'] * 100, 2),
673
+ "recall@20": round(results['recall@20'] * 100, 2),
674
+ "mrr": round(results['mrr'] * 100, 2)
675
+ }
676
+
677
+ # Save files to HuggingFace Hub
678
+ try:
679
+ # 1. Save CSV file
680
+ csv_filename = f"predictions_{timestamp}.csv"
681
+ csv_path_in_repo = f"submissions/{folder_name}/{csv_filename}"
682
+ hub_storage.save_to_hub(
683
+ file_content=temp_csv_path,
684
+ path_in_repo=csv_path_in_repo,
685
+ commit_message=f"Add submission: {method_name} by {team_name}"
686
+ )
687
+
688
+ # 2. Save metadata
689
+ submission_data = {
690
+ **meta_data,
691
+ "results": processed_results,
692
+ "status": "approved", # or "pending_review"
693
+ "submission_date": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
694
+ "csv_path": csv_path_in_repo
695
+ }
696
+
697
+ metadata_fd, temp_metadata_path = tempfile.mkstemp(suffix='.json')
698
+ temp_files.append(temp_metadata_path)
699
+ os.close(metadata_fd)
700
+
701
+ with open(temp_metadata_path, 'w') as f:
702
+ json.dump(submission_data, f, indent=4)
703
+
704
+ metadata_path = f"submissions/{folder_name}/metadata_{timestamp}.json"
705
+ hub_storage.save_to_hub(
706
+ file_content=temp_metadata_path,
707
+ path_in_repo=metadata_path,
708
+ commit_message=f"Add metadata: {method_name} by {team_name}"
709
+ )
710
+
711
+ # 3. Create or update latest.json
712
+ latest_info = {
713
+ "latest_submission": timestamp,
714
+ "status": "approved", # or "pending_review"
715
+ "method_name": method_name,
716
+ "team_name": team_name
717
+ }
718
+
719
+ latest_fd, temp_latest_path = tempfile.mkstemp(suffix='.json')
720
+ temp_files.append(temp_latest_path)
721
+ os.close(latest_fd)
722
+
723
+ with open(temp_latest_path, 'w') as f:
724
+ json.dump(latest_info, f, indent=4)
725
+
726
+ latest_path = f"submissions/{folder_name}/latest.json"
727
+ hub_storage.save_to_hub(
728
+ file_content=temp_latest_path,
729
+ path_in_repo=latest_path,
730
+ commit_message=f"Update latest submission info for {method_name}"
731
+ )
732
+
733
+ except Exception as e:
734
+ raise RuntimeError(f"Failed to save files to HuggingFace Hub: {str(e)}")
735
+
736
+ # Send confirmation email and update leaderboard data
737
+ # send_submission_confirmation(meta_data, processed_results)
738
+ update_leaderboard_data(submission_data)
739
+
740
+ # Return success message
741
+ return f"""
742
+ Submission successful!
743
+
744
+ Evaluation Results:
745
+ Hit@1: {processed_results['hit@1']:.2f}%
746
+ Hit@5: {processed_results['hit@5']:.2f}%
747
+ Recall@20: {processed_results['recall@20']:.2f}%
748
+ MRR: {processed_results['mrr']:.2f}%
749
+
750
+ Your submission has been saved and a confirmation email has been sent to {contact_email}.
751
+ Once approved, your results will appear in the leaderboard under: {method_name}
752
+
753
+ You can find your submission at:
754
+ https://huggingface.co/spaces/{REPO_ID}/tree/main/submissions/{folder_name}
755
+
756
+ Please refresh the page to see your submission in the leaderboard.
757
+ """
758
+
759
+ except Exception as e:
760
+ error_message = f"Error processing submission: {str(e)}"
761
+ # send_error_notification(meta_data, error_message)
762
+ return error_message
763
+ finally:
764
+ # Clean up temporary files
765
+ for temp_file in temp_files:
766
+ try:
767
+ if os.path.exists(temp_file):
768
+ os.unlink(temp_file)
769
+ except Exception as e:
770
+ print(f"Warning: Failed to delete temporary file {temp_file}: {str(e)}")
771
+
772
+
773
+ def filter_by_model_type(df, selected_types):
774
+ """
775
+ Filter DataFrame by selected model types, including submitted models.
776
+ """
777
+ if not selected_types:
778
+ return df.head(0)
779
+
780
+ # Get all models from selected types
781
+ selected_models = []
782
+ for type_name in selected_types:
783
+ selected_models.extend(model_types[type_name])
784
+
785
+ # Filter DataFrame to include only selected models
786
+ return df[df['Method'].isin(selected_models)]
787
+
788
+ def format_dataframe(df, dataset):
789
+ columns = ['Method'] + [col for col in df.columns if dataset in col]
790
+ filtered_df = df[columns].copy()
791
+ filtered_df.columns = [col.split('_')[-1] if '_' in col else col for col in filtered_df.columns]
792
+ filtered_df = filtered_df.sort_values('MRR', ascending=False)
793
+ return filtered_df
794
+
795
+ def update_tables(selected_types):
796
+ """
797
+ Update tables based on selected model types.
798
+ Include all models from selected categories.
799
+ """
800
+ if not selected_types:
801
+ return [df.head(0) for df in [df_synthesized_full, df_synthesized_10, df_human_generated]]
802
+
803
+ filtered_df_full = filter_by_model_type(df_synthesized_full, selected_types)
804
+ filtered_df_10 = filter_by_model_type(df_synthesized_10, selected_types)
805
+ filtered_df_human = filter_by_model_type(df_human_generated, selected_types)
806
+
807
+ outputs = []
808
+ for df in [filtered_df_full, filtered_df_10, filtered_df_human]:
809
+ for dataset in ['AMAZON', 'MAG', 'PRIME']:
810
+ outputs.append(format_dataframe(df, f"STARK-{dataset}"))
811
+
812
+ return outputs
813
+
814
+ css = """
815
+ table > thead {
816
+ white-space: normal
817
+ }
818
+
819
+ table {
820
+ --cell-width-1: 250px
821
+ }
822
+
823
+ table > tbody > tr > td:nth-child(2) > div {
824
+ overflow-x: auto
825
+ }
826
+
827
+ .tab-nav {
828
+ border-bottom: 1px solid rgba(255, 255, 255, 0.1);
829
+ margin-bottom: 1rem;
830
+ }
831
+ """
832
+
833
+ # Main application
834
+ with gr.Blocks(css=css) as demo:
835
+ gr.Markdown("# Semi-structured Retrieval Benchmark (STaRK) Leaderboard")
836
+ gr.Markdown("Refer to the [STaRK paper](https://arxiv.org/pdf/2404.13207) for details on metrics, tasks and models.")
837
+
838
+ # Initialize leaderboard at startup
839
+ print("Starting leaderboard initialization...")
840
+ initialize_leaderboard()
841
+ print("Leaderboard initialization finished")
842
+
843
+ # Model type filter
844
+ model_type_filter = gr.CheckboxGroup(
845
+ choices=list(model_types.keys()),
846
+ value=list(model_types.keys()),
847
+ label="Model types",
848
+ interactive=True
849
+ )
850
+
851
+ # Initialize dataframes list
852
+ all_dfs = []
853
+
854
+ # Create nested tabs structure
855
+ with gr.Tabs() as outer_tabs:
856
+ with gr.TabItem("Synthesized (full)"):
857
+ with gr.Tabs() as inner_tabs1:
858
+ for dataset in ['AMAZON', 'MAG', 'PRIME']:
859
+ with gr.TabItem(dataset):
860
+ all_dfs.append(gr.DataFrame(interactive=False))
861
+
862
+ with gr.TabItem("Synthesized (10%)"):
863
+ with gr.Tabs() as inner_tabs2:
864
+ for dataset in ['AMAZON', 'MAG', 'PRIME']:
865
+ with gr.TabItem(dataset):
866
+ all_dfs.append(gr.DataFrame(interactive=False))
867
+
868
+ with gr.TabItem("Human-Generated"):
869
+ with gr.Tabs() as inner_tabs3:
870
+ for dataset in ['AMAZON', 'MAG', 'PRIME']:
871
+ with gr.TabItem(dataset):
872
+ all_dfs.append(gr.DataFrame(interactive=False))
873
+
874
+ # Submission section
875
+ gr.Markdown("---")
876
+ gr.Markdown("## Submit Your Results")
877
+ gr.Markdown("""
878
+ Submit your results to be included in the leaderboard. Please ensure your submission meets all requirements.
879
+ For questions, contact [email protected]. Detailed instructions can be found in the [submission instructions](https://docs.google.com/document/d/11coGjTmOEi9p9-PUq1oy0eTOj8f_8CVQhDl5_0FKT14/edit?usp=sharing).
880
+ """)
881
+
882
  with gr.Row():
883
+ with gr.Column():
884
+ method_name = gr.Textbox(
885
+ label="Method Name (max 25 chars)*",
886
+ placeholder="e.g., MyRetrievalModel-v1"
887
+ )
888
+ dataset = gr.Dropdown(
889
+ choices=["amazon", "mag", "prime"],
890
+ label="Dataset*",
891
+ value="amazon"
892
+ )
893
+ split = gr.Dropdown(
894
+ choices=["test", "test-0.1", "human_generated_eval"],
895
+ label="Split*",
896
+ value="test"
897
+ )
898
+ team_name = gr.Textbox(
899
+ label="Team Name (max 25 chars)*",
900
+ placeholder="e.g., Stanford NLP"
901
+ )
902
+ contact_email = gr.Textbox(
903
+ label="Contact Email(s)*",
904
905
+ )
906
+ model_type = gr.Dropdown(
907
+ choices=list(model_types.keys()),
908
+ label="Model Type*",
909
+ value="Others",
910
+ info="Select the appropriate category for your model"
911
  )
912
+
913
+
914
+ with gr.Column():
915
+ model_description = gr.Textbox(
916
+ label="Model Description*",
917
+ lines=3,
918
+ placeholder="Briefly describe how your retriever model works..."
919
+ )
920
+ code_repo = gr.Textbox(
921
+ label="Code Repository*",
922
+ placeholder="https://github.com/snap-stanford/stark-leaderboard"
923
+ )
924
+ hardware = gr.Textbox(
925
+ label="Hardware Specifications*",
926
+ placeholder="e.g., 4x NVIDIA A100 80GB"
927
+ )
928
+ csv_file = gr.File(
929
+ label="Prediction CSV*",
930
+ file_types=[".csv"],
931
+ type="filepath"
932
+ )
933
+ paper_link = gr.Textbox(
934
+ label="Paper Link (Optional)",
935
+ placeholder="https://arxiv.org/abs/..."
936
+ )
937
+
938
+ submit_btn = gr.Button("Submit", variant="primary")
939
+ result = gr.Textbox(label="Submission Status", interactive=False)
940
+
941
+
942
+ # Set up event handlers
943
+ model_type_filter.change(
944
+ update_tables,
945
+ inputs=[model_type_filter],
946
+ outputs=all_dfs
947
+ )
948
+
949
+ # Event handler for submission button
950
+ submit_btn.click(
951
+ fn=process_submission,
952
+ inputs=[
953
+ method_name, team_name, dataset, split, contact_email,
954
+ code_repo, csv_file, model_description, hardware, paper_link, model_type
955
+ ],
956
+ outputs=result
957
+ ).success( # Add a success handler to update tables after successful submission
958
+ fn=update_tables,
959
+ inputs=[model_type_filter],
960
+ outputs=all_dfs
961
+ )
962
+
963
+ # Initial table update
964
+ demo.load(
965
+ update_tables,
966
+ inputs=[model_type_filter],
967
+ outputs=all_dfs
968
+ )
969
+
970
 
971
+ # Launch the application
972
+ demo.launch()
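
For reference, here is a minimal sketch (not part of this commit) of a prediction file in the shape that `validate_csv()` and `compute_metrics()` above expect; the query IDs and rankings below are placeholders:

```python
# Hypothetical example: build a predictions CSV with the two required columns.
# Each pred_rank entry is a list of candidate node IDs ordered best-first;
# validate_csv() requires at least 20 candidates, and only the top 100 are scored.
import pandas as pd

rows = [
    {"query_id": 0, "pred_rank": list(range(100))},       # placeholder ranking
    {"query_id": 1, "pred_rank": list(range(100, 200))},  # placeholder ranking
]
pd.DataFrame(rows).to_csv("predictions.csv", index=False)

# When read back, pred_rank arrives as a string and is parsed with eval().
# The same file can then be scored locally with, e.g.:
# compute_metrics("predictions.csv", dataset="amazon", split="test")
```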
 
 
requirements.txt CHANGED
@@ -10,7 +10,10 @@ matplotlib
10
  numpy
11
  pandas
12
  python-dateutil
 
13
  tqdm
14
  transformers
 
15
  tokenizers>=0.15.0
16
- sentencepiece
 
 
10
  numpy
11
  pandas
12
  python-dateutil
13
+ python-dotenv
14
  tqdm
15
  transformers
16
+ torch
17
  tokenizers>=0.15.0
18
+ sentencepiece
19
+ stark_qa
src/about.py CHANGED
@@ -21,11 +21,11 @@ NUM_FEWSHOT = 0 # Change with your few shot
21
 
22
 
23
  # Your leaderboard name
24
- TITLE = """<h1 align="center" id="space-title">LLMJudge Leaderboard</h1>"""
25
 
26
  # What does your leaderboard evaluate?
27
  INTRODUCTION_TEXT = """
28
- This is a leaderboard for LLMJudge challenge
29
  """
30
 
31
  # Which evaluations are you running? how can people reproduce what you have?
 
21
 
22
 
23
  # Your leaderboard name
24
+ TITLE = """<h1 align="center" id="space-title">Demo leaderboard</h1>"""
25
 
26
  # What does your leaderboard evaluate?
27
  INTRODUCTION_TEXT = """
28
+ Intro text
29
  """
30
 
31
  # Which evaluations are you running? how can people reproduce what you have?
submissions/debug_submission_none/latest.json ADDED
@@ -0,0 +1,6 @@
1
+ {
2
+ "latest_submission": "20241024_125801",
3
+ "status": "approved",
4
+ "method_name": "debug-submission",
5
+ "team_name": "none"
6
+ }
submissions/debug_submission_none/metadata_20241024_125801.json ADDED
@@ -0,0 +1,21 @@
1
+ {
2
+ "Method Name": "debug-submission",
3
+ "Team Name": "none",
4
+ "Dataset": "mag",
5
+ "Split": "human_generated_eval",
6
+ "Contact Email(s)": "none",
7
+ "Code Repository": "none",
8
+ "Model Description": "none",
9
+ "Hardware": "none",
10
+ "(Optional) Paper link": "none",
11
+ "Model Type": "Others",
12
+ "results": {
13
+ "hit@1": 28.57,
14
+ "hit@5": 41.67,
15
+ "recall@20": 35.95,
16
+ "mrr": 35.94
17
+ },
18
+ "status": "approved",
19
+ "submission_date": "2024-10-24 12:58:41",
20
+ "csv_path": "submissions/debug_submission_none/predictions_20241024_125801.csv"
21
+ }
submissions/debug_submission_none/predictions_20241024_125801.csv ADDED
The diff for this file is too large to render. See raw diff
 
utils/__init__.py ADDED
File without changes
utils/hub_storage.py ADDED
@@ -0,0 +1,41 @@
1
+ from pathlib import Path
2
+ from huggingface_hub import HfApi
3
+ from .token_handler import TokenHandler
4
+
5
+ class HubStorage:
6
+ def __init__(self, repo_id):
7
+ self.repo_id = repo_id
8
+ self.api = HfApi()
9
+
10
+ def get_file_content(self, file_path):
11
+ """
12
+ Get content of a file from the repository
13
+ """
14
+ try:
15
+ local_path = self.api.hf_hub_download(
16
+ repo_id=self.repo_id,
17
+ repo_type="space",
18
+ filename=file_path
19
+ )
20
+ with open(local_path, "r") as f:
21
+ return f.read()
22
+ except Exception as e:
23
+ print(f"Error reading file {file_path}: {str(e)}")
24
+ return None
25
+
26
+ def save_to_hub(self, file_content, path_in_repo, commit_message):
27
+ """
28
+ Save a file to the hub
29
+ """
30
+ try:
31
+ self.api.upload_file(
32
+ path_or_fileobj=file_content,
33
+ path_in_repo=path_in_repo,
34
+ repo_id=self.repo_id,
35
+ repo_type="space",
36
+ commit_message=commit_message
37
+ )
38
+ return True
39
+ except Exception as e:
40
+ print(f"Error saving file to hub: {str(e)}")
41
+ return False
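
As a usage sketch (assuming write access to the Space and an `HF_TOKEN` available to `HfApi`, as set up by `TokenHandler` below), this is roughly how `app.py` drives `HubStorage` when persisting a submission; the file names and repo path here are illustrative:

```python
from utils.hub_storage import HubStorage

# The Space repo used as REPO_ID in app.py.
storage = HubStorage("snap-stanford/stark-leaderboard")

# Upload a local file into the Space's submissions/ tree; save_to_hub() returns
# True on success and False (after printing the error) on failure.
ok = storage.save_to_hub(
    file_content="predictions.csv",  # local path; upload_file also accepts a file object
    path_in_repo="submissions/MyModel_MyTeam/predictions_20240101_000000.csv",  # illustrative
    commit_message="Add submission: MyModel by MyTeam",
)
print("uploaded" if ok else "upload failed")
```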
utils/token_handler.py ADDED
@@ -0,0 +1,75 @@
1
+ import os
2
+ from dotenv import load_dotenv
3
+ from huggingface_hub import HfApi
4
+ from pathlib import Path
5
+
6
+ class TokenHandler:
7
+ def __init__(self):
8
+ # Load environment variables from .env file if it exists
9
+ self.load_environment()
10
+ self.token = self._get_token()
11
+ self.api = HfApi()
12
+
13
+ def load_environment(self):
14
+ """Load environment variables from .env file"""
15
+ env_path = Path('.env')
16
+ if env_path.exists():
17
+ load_dotenv(env_path)
18
+
19
+ def _get_token(self) -> str:
20
+ """Get HuggingFace token from environment variables"""
21
+ token = os.getenv("HF_TOKEN")
22
+ if not token:
23
+ raise EnvironmentError(
24
+ "HF_TOKEN not found in environment variables. "
25
+ "Please set it up using one of these methods:\n"
26
+ "1. Create a .env file with HF_TOKEN=your_token\n"
27
+ "2. Set environment variable HF_TOKEN=your_token\n"
28
+ "3. Add HF_TOKEN to your HuggingFace Space secrets"
29
+ )
30
+ return token
31
+
32
+ def verify_token(self) -> bool:
33
+ """Verify if the token is valid by making a test API call"""
34
+ try:
35
+ # Try to get user information using the token
36
+ self.api.whoami(token=self.token)
37
+ return True
38
+ except Exception as e:
39
+ print(f"Token verification failed: {e}")
40
+ return False
41
+
42
+ def get_verified_token(self) -> str:
43
+ """Get token and verify it's working"""
44
+ if not self.verify_token():
45
+ raise ValueError(
46
+ "Invalid or expired HuggingFace token. "
47
+ "Please check your token at https://huggingface.co/settings/tokens"
48
+ )
49
+ return self.token
50
+
51
+ # Usage example
52
+ def initialize_hf_token():
53
+ """Initialize and verify HuggingFace token"""
54
+ try:
55
+ handler = TokenHandler()
56
+ token = handler.get_verified_token()
57
+ print("✓ HuggingFace token successfully verified")
58
+ return token
59
+ except Exception as e:
60
+ print(f"✗ Error initializing HuggingFace token: {e}")
61
+ return None
62
+
63
+ # Example of how to use in your main code
64
+ if __name__ == "__main__":
65
+ # Create .env file if it doesn't exist
66
+ if not Path('.env').exists():
67
+ print("Creating .env file template...")
68
+ with open('.env', 'w') as f:
69
+ f.write("HF_TOKEN=your_token_here\n")
70
+ print("Please edit .env file and add your HuggingFace token")
71
+
72
+ # Initialize token
73
+ token = initialize_hf_token()
74
+ if token:
75
+ print("Ready to use HuggingFace API")