Commit · 43dd1e4
Parent(s): 0d282c0
upgrade leaderboard

app.py CHANGED
@@ -14,6 +14,8 @@ from email.mime.text import MIMEText
 from huggingface_hub import HfApi
 import shutil
 import tempfile
+from sklearn.metrics import cohen_kappa_score
+import krippendorff
 
 from stark_qa import load_qa
 from stark_qa.evaluator import Evaluator
@@ -23,7 +25,7 @@ from utils.token_handler import TokenHandler
 
 # Initialize storage once at startup
 try:
-    REPO_ID = "
+    REPO_ID = "rahmanidashti/llm-as-a-rel" # Replace with your space name
     hub_storage = HubStorage(REPO_ID)
 except Exception as e:
     raise RuntimeError(f"Failed to initialize HuggingFace Hub storage: {e}")
@@ -56,50 +58,63 @@ def process_single_instance(args):
     result["idx"], result["query_id"] = idx, query_id
     return result
 
-
 def compute_metrics(csv_path: str, dataset: str, split: str, num_workers: int = 4):
+    """
+    Compute the metrics for the evaluation.
+
+    Parameters:
+        csv_path (str): The path to the submission file for evaluation.
+    """
     candidate_ids_dict = {
         'amazon': [i for i in range(957192)],
         'mag': [i for i in range(1172724, 1872968)],
         'prime': [i for i in range(129375)]
     }
     try:
-        eval_csv = pd.read_csv(csv_path)
-
-        if 'query_id' not in eval_csv.columns:
-            raise ValueError('No `query_id` column found in the submitted csv.')
-        if 'pred_rank' not in eval_csv.columns:
-            raise ValueError('No `pred_rank` column found in the submitted csv.')
-
-        eval_csv = eval_csv[['query_id', 'pred_rank']]
-
-        if dataset not in candidate_ids_dict:
-            raise ValueError(f"Invalid dataset '{dataset}', expected one of {list(candidate_ids_dict.keys())}.")
-        if split not in ['test', 'test-0.1', 'human_generated_eval']:
-            raise ValueError(f"Invalid split '{split}', expected one of ['test', 'test-0.1', 'human_generated_eval'].")
-
-        evaluator = Evaluator(candidate_ids_dict[dataset])
-        eval_metrics = ['hit@1', 'hit@5', 'recall@20', 'mrr']
-        qa_dataset = load_qa(dataset, human_generated_eval=split == 'human_generated_eval')
-        split_idx = qa_dataset.get_idx_split()
-        all_indices = split_idx[split].tolist()
-
-        results_list = []
-        query_ids = []
+        # eval_csv = pd.read_csv(csv_path)
+        eval_csv = pd.read_csv(csv_path, sep=" ", header=None, names=['qid', 'Q0', 'docid', 'score'])
+        eval_csv['score'] = [0 if x < 0 else 3 if x > 3 else x for x in eval_csv['score']]
+        test_eval_df = pd.merge(test_data, eval_csv, on=['qid', 'docid'], how='outer')
+        cohen_kappa = cohen_kappa_score(test_eval_df['score_x'], test_eval_df['score_y'])
+        krippendorff_alpha = krippendorff.alpha(reliability_data=[test_eval_df['score_x'], test_eval_df['score_y']], value_domain=[0,1,2,3], level_of_measurement='ordinal')
+
+        # if 'query_id' not in eval_csv.columns:
+        #     raise ValueError('No `query_id` column found in the submitted csv.')
+        # if 'pred_rank' not in eval_csv.columns:
+        #     raise ValueError('No `pred_rank` column found in the submitted csv.')
+
+        # eval_csv = eval_csv[['query_id', 'pred_rank']]
+
+        # if dataset not in candidate_ids_dict:
+        #     raise ValueError(f"Invalid dataset '{dataset}', expected one of {list(candidate_ids_dict.keys())}.")
+        # if split not in ['test', 'test-0.1', 'human_generated_eval']:
+        #     raise ValueError(f"Invalid split '{split}', expected one of ['test', 'test-0.1', 'human_generated_eval'].")
+
+        # evaluator = Evaluator(candidate_ids_dict[dataset])
+        # eval_metrics = ['hit@1', 'hit@5', 'recall@20', 'mrr']
+        # qa_dataset = load_qa(dataset, human_generated_eval=split == 'human_generated_eval')
+        # split_idx = qa_dataset.get_idx_split()
+        # all_indices = split_idx[split].tolist()
+
+        # results_list = []
+        # query_ids = []
 
         # Prepare args for each worker
-        args = [(idx, eval_csv, qa_dataset, evaluator, eval_metrics) for idx in all_indices]
+        # args = [(idx, eval_csv, qa_dataset, evaluator, eval_metrics) for idx in all_indices]
 
-        with ProcessPoolExecutor(max_workers=num_workers) as executor:
-            futures = [executor.submit(process_single_instance, arg) for arg in args]
-            for future in tqdm(as_completed(futures), total=len(futures)):
-                result = future.result()  # This will raise an error if the worker encountered one
-                results_list.append(result)
-                query_ids.append(result['query_id'])
+        # with ProcessPoolExecutor(max_workers=num_workers) as executor:
+        #     futures = [executor.submit(process_single_instance, arg) for arg in args]
+        #     for future in tqdm(as_completed(futures), total=len(futures)):
+        #         result = future.result() # This will raise an error if the worker encountered one
+        #         results_list.append(result)
+        #         query_ids.append(result['query_id'])
 
         # Concatenate results and compute final metrics
-        eval_csv = pd.concat([eval_csv, pd.DataFrame(results_list)], ignore_index=True)
+        # eval_csv = pd.concat([eval_csv, pd.DataFrame(results_list)], ignore_index=True)
         final_results = {
-            metric: np.mean(eval_csv[eval_csv['query_id'].isin(query_ids)][metric]) for metric in eval_metrics
+            # metric: np.mean(eval_csv[eval_csv['query_id'].isin(query_ids)][metric]) for metric in eval_metrics
+            'kappa': round(cohen_kappa, 4),
+            'alpha': round(krippendorff_alpha, 4)
         }
         return final_results
 
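For reference, a minimal standalone sketch of the agreement computation this hunk introduces (toy 0-3 graded relevance labels; the cohen_kappa_score and krippendorff.alpha calls are the same ones used in compute_metrics above):

import krippendorff
from sklearn.metrics import cohen_kappa_score

# Two parallel label sequences on the 0-3 graded relevance scale.
human = [0, 1, 2, 3, 2, 1, 0, 3]
system = [0, 1, 3, 3, 2, 0, 0, 2]

# Cohen's kappa: chance-corrected agreement between the two label sets.
kappa = cohen_kappa_score(human, system)

# Krippendorff's alpha with ordinal level of measurement, as above.
alpha = krippendorff.alpha(
    reliability_data=[human, system],
    value_domain=[0, 1, 2, 3],
    level_of_measurement='ordinal',
)

print(round(kappa, 4), round(alpha, 4))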
@@ -110,60 +125,65 @@ def compute_metrics(csv_path: str, dataset: str, split: str, num_workers: int =
     except Exception as error:
         return f"{error}"
 
-
 # Data dictionaries for leaderboard
 data_synthesized_full = {
-    'Method': ['BM25', 'DPR (roberta)', 'ANCE (roberta)', 'QAGNN (roberta)', 'ada-002', 'voyage-l2-instruct', 'LLM2Vec', 'GritLM-7b', 'multi-ada-002', 'ColBERTv2'],
-    'STARK-AMAZON_Hit@1': [44.94, 15.29, 30.96, 26.56, 39.16, 40.93, 21.74, 42.08, 40.07, 46.10],
-    'STARK-AMAZON_Hit@5': [67.42, 47.93, 51.06, 50.01, 62.73, 64.37, 41.65, 66.87, 64.98, 66.02],
-    'STARK-AMAZON_R@20': [53.77, 44.49, 41.95, 52.05, 53.29, 54.28, 33.22, 56.52, 55.12, 53.44],
-    'STARK-AMAZON_MRR': [55.30, 30.20, 40.66, 37.75, 50.35, 51.60, 31.47, 53.46, 51.55, 55.51],
-    'STARK-MAG_Hit@1': [25.85, 10.51, 21.96, 12.88, 29.08, 30.06, 18.01, 37.90, 25.92, 31.18],
-    'STARK-MAG_Hit@5': [45.25, 35.23, 36.50, 39.01, 49.61, 50.58, 34.85, 56.74, 50.43, 46.42],
-    'STARK-MAG_R@20': [45.69, 42.11, 35.32, 46.97, 48.36, 50.49, 35.46, 46.40, 50.80, 43.94],
-    'STARK-MAG_MRR': [34.91, 21.34, 29.14, 29.12, 38.62, 39.66, 26.10, 47.25, 36.94, 38.39],
-    'STARK-PRIME_Hit@1': [12.75, 4.46, 6.53, 8.85, 12.63, 10.85, 10.10, 15.57, 15.10, 11.75],
-    'STARK-PRIME_Hit@5': [27.92, 21.85, 15.67, 21.35, 31.49, 30.23, 22.49, 33.42, 33.56, 23.85],
-    'STARK-PRIME_R@20': [31.25, 30.13, 16.52, 29.63, 36.00, 37.83, 26.34, 39.09, 38.05, 25.04],
-    'STARK-PRIME_MRR': [19.84, 12.38, 11.05, 14.73, 21.41, 19.99, 16.12, 24.11, 23.49, 17.39]
+    'Method': ['BM25', 'DPR (roberta)', 'ANCE (roberta)'],
+    'LLMJudge-DL2023_Kappa': [44.94, 15.29, 30.96],
+    'LLMJudge-DL2023_Alpha': [67.42, 47.93, 51.06],
 }
 
-data_synthesized_10 = {
-    'Method': ['BM25', 'DPR (roberta)', 'ANCE (roberta)', 'QAGNN (roberta)', 'ada-002', 'voyage-l2-instruct', 'LLM2Vec', 'GritLM-7b', 'multi-ada-002', 'ColBERTv2', 'Claude3 Reranker', 'GPT4 Reranker'],
-    'STARK-AMAZON_Hit@1': [42.68, 16.46, 30.09, 25.00, 39.02, 43.29, 18.90, 43.29, 40.85, 44.31, 45.49, 44.79],
-    'STARK-AMAZON_Hit@5': [67.07, 50.00, 49.27, 48.17, 64.02, 67.68, 37.80, 71.34, 62.80, 65.24, 71.13, 71.17],
-    'STARK-AMAZON_R@20': [54.48, 42.15, 41.91, 51.65, 49.30, 56.04, 34.73, 56.14, 52.47, 51.00, 53.77, 55.35],
-    'STARK-AMAZON_MRR': [54.02, 30.20, 39.30, 36.87, 50.32, 54.20, 28.76, 55.07, 51.54, 55.07, 55.91, 55.69],
-    'STARK-MAG_Hit@1': [27.81, 11.65, 22.89, 12.03, 28.20, 34.59, 19.17, 38.35, 25.56, 31.58, 36.54, 40.90],
-    'STARK-MAG_Hit@5': [45.48, 36.84, 37.26, 37.97, 52.63, 50.75, 33.46, 58.64, 50.37, 47.36, 53.17, 58.18],
-    'STARK-MAG_R@20': [44.59, 42.30, 44.16, 47.98, 49.25, 50.75, 29.85, 46.38, 53.03, 45.72, 48.36, 48.60],
-    'STARK-MAG_MRR': [35.97, 21.82, 30.00, 28.70, 38.55, 42.90, 26.06, 48.25, 36.82, 38.98, 44.15, 49.00],
-    'STARK-PRIME_Hit@1': [13.93, 5.00, 6.78, 7.14, 15.36, 12.14, 9.29, 16.79, 15.36, 15.00, 17.79, 18.28],
-    'STARK-PRIME_Hit@5': [31.07, 23.57, 16.15, 17.14, 31.07, 31.42, 20.7, 34.29, 32.86, 26.07, 36.90, 37.28],
-    'STARK-PRIME_R@20': [32.84, 30.50, 17.07, 32.95, 37.88, 37.34, 25.54, 41.11, 40.99, 27.78, 35.57, 34.05],
-    'STARK-PRIME_MRR': [21.68, 13.50, 11.42, 16.27, 23.50, 21.23, 15.00, 24.99, 23.70, 19.98, 26.27, 26.55]
-}
-
-data_human_generated = {
-    'Method': ['BM25', 'DPR (roberta)', 'ANCE (roberta)', 'QAGNN (roberta)', 'ada-002', 'voyage-l2-instruct', 'LLM2Vec', 'GritLM-7b', 'multi-ada-002', 'ColBERTv2', 'Claude3 Reranker', 'GPT4 Reranker'],
-    'STARK-AMAZON_Hit@1': [27.16, 16.05, 25.93, 22.22, 39.50, 35.80, 29.63, 40.74, 46.91, 33.33, 53.09, 50.62],
-    'STARK-AMAZON_Hit@5': [51.85, 39.51, 54.32, 49.38, 64.19, 62.96, 46.91, 71.60, 72.84, 55.56, 74.07, 75.31],
-    'STARK-AMAZON_R@20': [29.23, 15.23, 23.69, 21.54, 35.46, 33.01, 21.21, 36.30, 40.22, 29.03, 35.46, 35.46],
-    'STARK-AMAZON_MRR': [18.79, 27.21, 37.12, 31.33, 52.65, 47.84, 38.61, 53.21, 58.74, 43.77, 62.11, 61.06],
-    'STARK-MAG_Hit@1': [32.14, 4.72, 25.00, 20.24, 28.57, 22.62, 16.67, 34.52, 23.81, 33.33, 38.10, 36.90],
-    'STARK-MAG_Hit@5': [41.67, 9.52, 30.95, 26.19, 41.67, 36.90, 28.57, 44.04, 41.67, 36.90, 45.24, 46.43],
-    'STARK-MAG_R@20': [32.46, 25.00, 27.24, 28.76, 35.95, 32.44, 21.74, 34.57, 39.85, 30.50, 35.95, 35.95],
-    'STARK-MAG_MRR': [37.42, 7.90, 27.98, 25.53, 35.81, 29.68, 21.59, 38.72, 31.43, 35.97, 42.00, 40.65],
-    'STARK-PRIME_Hit@1': [22.45, 2.04, 7.14, 6.12, 17.35, 16.33, 9.18, 25.51, 24.49, 15.31, 28.57, 28.57],
-    'STARK-PRIME_Hit@5': [41.84, 9.18, 13.27, 13.27, 34.69, 32.65, 21.43, 41.84, 39.80, 26.53, 46.94, 44.90],
-    'STARK-PRIME_R@20': [42.32, 10.69, 11.72, 17.62, 41.09, 39.01, 26.77, 48.10, 47.21, 25.56, 41.61, 41.61],
-    'STARK-PRIME_MRR': [30.37, 7.05, 10.07, 9.39, 26.35, 24.33, 15.24, 34.28, 32.98, 19.67, 36.32, 34.82]
-}
+# data_synthesized_full = {
+#     'Method': ['BM25', 'DPR (roberta)', 'ANCE (roberta)', 'QAGNN (roberta)', 'ada-002', 'voyage-l2-instruct', 'LLM2Vec', 'GritLM-7b', 'multi-ada-002', 'ColBERTv2'],
+#     'STARK-AMAZON_Hit@1': [44.94, 15.29, 30.96, 26.56, 39.16, 40.93, 21.74, 42.08, 40.07, 46.10],
+#     'STARK-AMAZON_Hit@5': [67.42, 47.93, 51.06, 50.01, 62.73, 64.37, 41.65, 66.87, 64.98, 66.02],
+#     'STARK-AMAZON_R@20': [53.77, 44.49, 41.95, 52.05, 53.29, 54.28, 33.22, 56.52, 55.12, 53.44],
+#     'STARK-AMAZON_MRR': [55.30, 30.20, 40.66, 37.75, 50.35, 51.60, 31.47, 53.46, 51.55, 55.51],
+#     'STARK-MAG_Hit@1': [25.85, 10.51, 21.96, 12.88, 29.08, 30.06, 18.01, 37.90, 25.92, 31.18],
+#     'STARK-MAG_Hit@5': [45.25, 35.23, 36.50, 39.01, 49.61, 50.58, 34.85, 56.74, 50.43, 46.42],
+#     'STARK-MAG_R@20': [45.69, 42.11, 35.32, 46.97, 48.36, 50.49, 35.46, 46.40, 50.80, 43.94],
+#     'STARK-MAG_MRR': [34.91, 21.34, 29.14, 29.12, 38.62, 39.66, 26.10, 47.25, 36.94, 38.39],
+#     'STARK-PRIME_Hit@1': [12.75, 4.46, 6.53, 8.85, 12.63, 10.85, 10.10, 15.57, 15.10, 11.75],
+#     'STARK-PRIME_Hit@5': [27.92, 21.85, 15.67, 21.35, 31.49, 30.23, 22.49, 33.42, 33.56, 23.85],
+#     'STARK-PRIME_R@20': [31.25, 30.13, 16.52, 29.63, 36.00, 37.83, 26.34, 39.09, 38.05, 25.04],
+#     'STARK-PRIME_MRR': [19.84, 12.38, 11.05, 14.73, 21.41, 19.99, 16.12, 24.11, 23.49, 17.39]
+# }
+
+# data_synthesized_10 = {
+#     'Method': ['BM25', 'DPR (roberta)', 'ANCE (roberta)', 'QAGNN (roberta)', 'ada-002', 'voyage-l2-instruct', 'LLM2Vec', 'GritLM-7b', 'multi-ada-002', 'ColBERTv2', 'Claude3 Reranker', 'GPT4 Reranker'],
+#     'STARK-AMAZON_Hit@1': [42.68, 16.46, 30.09, 25.00, 39.02, 43.29, 18.90, 43.29, 40.85, 44.31, 45.49, 44.79],
+#     'STARK-AMAZON_Hit@5': [67.07, 50.00, 49.27, 48.17, 64.02, 67.68, 37.80, 71.34, 62.80, 65.24, 71.13, 71.17],
+#     'STARK-AMAZON_R@20': [54.48, 42.15, 41.91, 51.65, 49.30, 56.04, 34.73, 56.14, 52.47, 51.00, 53.77, 55.35],
+#     'STARK-AMAZON_MRR': [54.02, 30.20, 39.30, 36.87, 50.32, 54.20, 28.76, 55.07, 51.54, 55.07, 55.91, 55.69],
+#     'STARK-MAG_Hit@1': [27.81, 11.65, 22.89, 12.03, 28.20, 34.59, 19.17, 38.35, 25.56, 31.58, 36.54, 40.90],
+#     'STARK-MAG_Hit@5': [45.48, 36.84, 37.26, 37.97, 52.63, 50.75, 33.46, 58.64, 50.37, 47.36, 53.17, 58.18],
+#     'STARK-MAG_R@20': [44.59, 42.30, 44.16, 47.98, 49.25, 50.75, 29.85, 46.38, 53.03, 45.72, 48.36, 48.60],
+#     'STARK-MAG_MRR': [35.97, 21.82, 30.00, 28.70, 38.55, 42.90, 26.06, 48.25, 36.82, 38.98, 44.15, 49.00],
+#     'STARK-PRIME_Hit@1': [13.93, 5.00, 6.78, 7.14, 15.36, 12.14, 9.29, 16.79, 15.36, 15.00, 17.79, 18.28],
+#     'STARK-PRIME_Hit@5': [31.07, 23.57, 16.15, 17.14, 31.07, 31.42, 20.7, 34.29, 32.86, 26.07, 36.90, 37.28],
+#     'STARK-PRIME_R@20': [32.84, 30.50, 17.07, 32.95, 37.88, 37.34, 25.54, 41.11, 40.99, 27.78, 35.57, 34.05],
+#     'STARK-PRIME_MRR': [21.68, 13.50, 11.42, 16.27, 23.50, 21.23, 15.00, 24.99, 23.70, 19.98, 26.27, 26.55]
+# }
+
+# data_human_generated = {
+#     'Method': ['BM25', 'DPR (roberta)', 'ANCE (roberta)', 'QAGNN (roberta)', 'ada-002', 'voyage-l2-instruct', 'LLM2Vec', 'GritLM-7b', 'multi-ada-002', 'ColBERTv2', 'Claude3 Reranker', 'GPT4 Reranker'],
+#     'STARK-AMAZON_Hit@1': [27.16, 16.05, 25.93, 22.22, 39.50, 35.80, 29.63, 40.74, 46.91, 33.33, 53.09, 50.62],
+#     'STARK-AMAZON_Hit@5': [51.85, 39.51, 54.32, 49.38, 64.19, 62.96, 46.91, 71.60, 72.84, 55.56, 74.07, 75.31],
+#     'STARK-AMAZON_R@20': [29.23, 15.23, 23.69, 21.54, 35.46, 33.01, 21.21, 36.30, 40.22, 29.03, 35.46, 35.46],
+#     'STARK-AMAZON_MRR': [18.79, 27.21, 37.12, 31.33, 52.65, 47.84, 38.61, 53.21, 58.74, 43.77, 62.11, 61.06],
+#     'STARK-MAG_Hit@1': [32.14, 4.72, 25.00, 20.24, 28.57, 22.62, 16.67, 34.52, 23.81, 33.33, 38.10, 36.90],
+#     'STARK-MAG_Hit@5': [41.67, 9.52, 30.95, 26.19, 41.67, 36.90, 28.57, 44.04, 41.67, 36.90, 45.24, 46.43],
+#     'STARK-MAG_R@20': [32.46, 25.00, 27.24, 28.76, 35.95, 32.44, 21.74, 34.57, 39.85, 30.50, 35.95, 35.95],
+#     'STARK-MAG_MRR': [37.42, 7.90, 27.98, 25.53, 35.81, 29.68, 21.59, 38.72, 31.43, 35.97, 42.00, 40.65],
+#     'STARK-PRIME_Hit@1': [22.45, 2.04, 7.14, 6.12, 17.35, 16.33, 9.18, 25.51, 24.49, 15.31, 28.57, 28.57],
+#     'STARK-PRIME_Hit@5': [41.84, 9.18, 13.27, 13.27, 34.69, 32.65, 21.43, 41.84, 39.80, 26.53, 46.94, 44.90],
+#     'STARK-PRIME_R@20': [42.32, 10.69, 11.72, 17.62, 41.09, 39.01, 26.77, 48.10, 47.21, 25.56, 41.61, 41.61],
+#     'STARK-PRIME_MRR': [30.37, 7.05, 10.07, 9.39, 26.35, 24.33, 15.24, 34.28, 32.98, 19.67, 36.32, 34.82]
+# }
 
 # Initialize DataFrames
 df_synthesized_full = pd.DataFrame(data_synthesized_full)
-df_synthesized_10 = pd.DataFrame(data_synthesized_10)
-df_human_generated = pd.DataFrame(data_human_generated)
+# df_synthesized_10 = pd.DataFrame(data_synthesized_10)
+# df_human_generated = pd.DataFrame(data_human_generated)
 
 # Model type definitions
 model_types = {
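The leaderboard tables are plain pandas DataFrames built from these dicts; a small sketch (the 'MyJudge' row and its scores are hypothetical) of how a submitted result would extend the baseline table:

import pandas as pd

df = pd.DataFrame({
    'Method': ['BM25', 'DPR (roberta)', 'ANCE (roberta)'],
    'LLMJudge-DL2023_Kappa': [44.94, 15.29, 30.96],
    'LLMJudge-DL2023_Alpha': [67.42, 47.93, 51.06],
})

# Hypothetical submission, on the same percentage scale as the baselines.
row = {'Method': 'MyJudge', 'LLMJudge-DL2023_Kappa': 35.00, 'LLMJudge-DL2023_Alpha': 55.00}
df = pd.concat([df, pd.DataFrame([row])], ignore_index=True)
print(df)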
@@ -347,13 +367,14 @@ def initialize_leaderboard():
     """
     Initialize the leaderboard with baseline results and submitted results.
     """
-    global df_synthesized_full, df_synthesized_10, df_human_generated
+    # global df_synthesized_full, df_synthesized_10, df_human_generated
+    global df_synthesized_full
 
     try:
         # First, initialize with baseline results
         df_synthesized_full = pd.DataFrame(data_synthesized_full)
-        df_synthesized_10 = pd.DataFrame(data_synthesized_10)
-        df_human_generated = pd.DataFrame(data_human_generated)
+        # df_synthesized_10 = pd.DataFrame(data_synthesized_10)
+        # df_human_generated = pd.DataFrame(data_human_generated)
 
         print("Initialized with baseline results")
 
@@ -766,7 +787,6 @@ def process_submission(method_name, team_name, dataset, split, contact_email, co
     except Exception as e:
         print(f"Warning: Failed to delete temporary file {temp_file}: {str(e)}")
 
-
 def filter_by_model_type(df, selected_types):
     """
     Filter DataFrame by selected model types, including submitted models.
@@ -795,19 +815,31 @@ def update_tables(selected_types):
     Include all models from selected categories.
     """
     if not selected_types:
-        return [df.head(0) for df in [df_synthesized_full, df_synthesized_10, df_human_generated]]
+        # return [df.head(0) for df in [df_synthesized_full, df_synthesized_10, df_human_generated]]
+        return [df.head(0) for df in [df_synthesized_full]]
 
     filtered_df_full = filter_by_model_type(df_synthesized_full, selected_types)
-    filtered_df_10 = filter_by_model_type(df_synthesized_10, selected_types)
-    filtered_df_human = filter_by_model_type(df_human_generated, selected_types)
+    # filtered_df_10 = filter_by_model_type(df_synthesized_10, selected_types)
+    # filtered_df_human = filter_by_model_type(df_human_generated, selected_types)
 
     outputs = []
-    for df in [filtered_df_full, filtered_df_10, filtered_df_human]:
-        for dataset in ['AMAZON', 'MAG', 'PRIME']:
-            outputs.append(format_dataframe(df, f"STARK-{dataset}"))
+    # for df in [filtered_df_full, filtered_df_10, filtered_df_human]:
+    for df in [filtered_df_full]:
+        for dataset in ['DL2023', 'MAG', 'PRIME']:
+            outputs.append(format_dataframe(df, f"LLMJudge-{dataset}"))
 
     return outputs
 
+def load_test_data():
+    # Split the text into a list
+    test_data = os.getenv('LLMJudgeTest').split()
+    # Reshape the list into a 2D array where each row contains 4 elements
+    test_data = [test_data[i:i+4] for i in range(0, len(test_data), 4)]
+    # Create a DataFrame; column names match the merge keys used in compute_metrics
+    test_data = pd.DataFrame(test_data, columns=['qid', 'Q0', 'docid', 'score'])
+
+    return test_data
+
 css = """
 table > thead {
     white-space: normal
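load_test_data reconstructs the held-out qrels from the LLMJudgeTest Space secret rather than from a file. A minimal reproduction under that assumption (the secret value below is made up; note the parsed scores come out as strings, so a cast such as astype(int) is likely needed before the merge and agreement computation in compute_metrics):

import os
import pandas as pd

# Hypothetical secret value: whitespace-separated "qid Q0 docid score" tuples.
os.environ['LLMJudgeTest'] = "q1 Q0 doc17 3 q1 Q0 doc42 1 q2 Q0 doc08 0"

tokens = os.getenv('LLMJudgeTest').split()
rows = [tokens[i:i + 4] for i in range(0, len(tokens), 4)]
test_data = pd.DataFrame(rows, columns=['qid', 'Q0', 'docid', 'score'])
test_data['score'] = test_data['score'].astype(int)  # scores parse as strings

print(test_data)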
@@ -829,8 +861,8 @@ table > tbody > tr > td:nth-child(2) > div {
 
 # Main application
 with gr.Blocks(css=css) as demo:
-    gr.Markdown("#
-    gr.Markdown("Refer to the [
+    gr.Markdown("# LLM-as-a-Rel: Automatic Relevance Judgment Leaderboard")
+    gr.Markdown("Refer to the [LLMJudge overview paper](https://arxiv.org/pdf/2408.08896) for details on metrics, tasks and models.")
 
     # Initialize leaderboard at startup
     print("Starting leaderboard initialization...")
@@ -870,20 +902,14 @@ with gr.Blocks(css=css) as demo:
 
     # Submission section
 
-    #
-    test_data =
-    # Reshape the list into a 2D array where each row contains 4 elements
-    test_data = [test_data[i:i+4] for i in range(0, len(test_data), 4)]
-    # Create a DataFrame
-    df = pd.DataFrame(test_data, columns=['qid', 'Q0', 'pid', 'score'])
-    # Display the DataFrame
-    print(df)
+    # Load test data
+    test_data = load_test_data()
 
     gr.Markdown("---")
     gr.Markdown("## Submit Your Results:")
     gr.Markdown("""
     Submit your results to be included in the leaderboard. Please ensure your submission meets all requirements.
-    For questions, contact
+    For questions, contact hossein.rahmani.22@ucl.ac.uk. Detailed instructions can be found in the [submission instructions](https://docs.google.com/document/d/11coGjTmOEi9p9-PUq1oy0eTOj8f_8CVQhDl5_0FKT14/edit?usp=sharing).
     """)
 
     with gr.Row():
@@ -933,8 +959,8 @@ with gr.Blocks(css=css) as demo:
             placeholder="e.g., 4x NVIDIA A100 80GB"
         )
         csv_file = gr.File(
-            label="Prediction
-            file_types=[".
+            label="Prediction TXT*",
+            file_types=[".txt"],
             type="filepath"
         )
         paper_link = gr.Textbox(
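Given the parser in compute_metrics (sep=" ", columns qid, Q0, docid, score), the uploaded prediction TXT is presumably one whitespace-separated judgment per line in TREC qrels style; a made-up example:

q1 Q0 doc17 2
q1 Q0 doc42 0
q2 Q0 doc08 3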
@@ -945,7 +971,6 @@ with gr.Blocks(css=css) as demo:
         submit_btn = gr.Button("Submit", variant="primary")
         result = gr.Textbox(label="Submission Status", interactive=False)
 
-
     # Set up event handlers
     model_type_filter.change(
         update_tables,
@@ -973,7 +998,6 @@ with gr.Blocks(css=css) as demo:
         inputs=[model_type_filter],
         outputs=all_dfs
     )
-
 
 # Launch the application
 demo.launch()