Commit 43dd1e4 · Parent: 0d282c0 · upgrade leaderboard
app.py
CHANGED
@@ -14,6 +14,8 @@ from email.mime.text import MIMEText
 from huggingface_hub import HfApi
 import shutil
 import tempfile
+from sklearn.metrics import cohen_kappa_score
+import krippendorff
 
 from stark_qa import load_qa
 from stark_qa.evaluator import Evaluator
@@ -23,7 +25,7 @@ from utils.token_handler import TokenHandler
 
 # Initialize storage once at startup
 try:
-    REPO_ID = "
+    REPO_ID = "rahmanidashti/llm-as-a-rel"  # Replace with your space name
     hub_storage = HubStorage(REPO_ID)
 except Exception as e:
     raise RuntimeError(f"Failed to initialize HuggingFace Hub storage: {e}")
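
HubStorage is a project-local helper around the Hub API, so its constructor is not shown in this diff. As a sketch (not part of the commit), the target repo can be sanity-checked with huggingface_hub directly before constructing it, assuming a recent huggingface_hub that provides HfApi.repo_exists:

    from huggingface_hub import HfApi

    # Sketch: confirm the Space referenced by REPO_ID is reachable
    api = HfApi()
    if not api.repo_exists("rahmanidashti/llm-as-a-rel", repo_type="space"):
        raise RuntimeError("Space repo not found; check REPO_ID and the HF token")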
@@ -56,50 +58,63 @@ def process_single_instance(args):
     result["idx"], result["query_id"] = idx, query_id
     return result
 
-
 def compute_metrics(csv_path: str, dataset: str, split: str, num_workers: int = 4):
+    """
+    Compute agreement metrics for a submitted run file.
+
+    Parameters:
+        csv_path (str): The path to the submission file for evaluation.
+    """
     candidate_ids_dict = {
         'amazon': [i for i in range(957192)],
         'mag': [i for i in range(1172724, 1872968)],
         'prime': [i for i in range(129375)]
     }
     try:
-        eval_csv = pd.read_csv(csv_path)
-
-        if 'query_id' not in eval_csv.columns:
-            raise ValueError('No `query_id` column found in the submitted csv.')
-        if 'pred_rank' not in eval_csv.columns:
-            raise ValueError('No `pred_rank` column found in the submitted csv.')
-
-        eval_csv = eval_csv[['query_id', 'pred_rank']]
-
-        if dataset not in candidate_ids_dict:
-            raise ValueError(f"Invalid dataset '{dataset}', expected one of {list(candidate_ids_dict.keys())}.")
-        if split not in ['test', 'test-0.1', 'human_generated_eval']:
-            raise ValueError(f"Invalid split '{split}', expected one of ['test', 'test-0.1', 'human_generated_eval'].")
-
-        evaluator = Evaluator(candidate_ids_dict[dataset])
-        eval_metrics = ['hit@1', 'hit@5', 'recall@20', 'mrr']
-        qa_dataset = load_qa(dataset, human_generated_eval=split == 'human_generated_eval')
-        split_idx = qa_dataset.get_idx_split()
-        all_indices = split_idx[split].tolist()
-
-        results_list = []
-        query_ids = []
+        # eval_csv = pd.read_csv(csv_path)
+        eval_csv = pd.read_csv(csv_path, sep=" ", header=None, names=['qid', 'Q0', 'docid', 'score'])
+        eval_csv['score'] = [0 if x < 0 else 3 if x > 3 else x for x in eval_csv['score']]
+        test_eval_df = pd.merge(test_data, eval_csv, on=['qid', 'docid'], how='outer')
+        cohen_kappa = cohen_kappa_score(test_eval_df['score_x'], test_eval_df['score_y'])
+        krippendorff_alpha = krippendorff.alpha(reliability_data=[test_eval_df['score_x'], test_eval_df['score_y']], value_domain=[0,1,2,3], level_of_measurement='ordinal')
+
+        # if 'query_id' not in eval_csv.columns:
+        #     raise ValueError('No `query_id` column found in the submitted csv.')
+        # if 'pred_rank' not in eval_csv.columns:
+        #     raise ValueError('No `pred_rank` column found in the submitted csv.')
+
+        # eval_csv = eval_csv[['query_id', 'pred_rank']]
+
+        # if dataset not in candidate_ids_dict:
+        #     raise ValueError(f"Invalid dataset '{dataset}', expected one of {list(candidate_ids_dict.keys())}.")
+        # if split not in ['test', 'test-0.1', 'human_generated_eval']:
+        #     raise ValueError(f"Invalid split '{split}', expected one of ['test', 'test-0.1', 'human_generated_eval'].")
+
+        # evaluator = Evaluator(candidate_ids_dict[dataset])
+        # eval_metrics = ['hit@1', 'hit@5', 'recall@20', 'mrr']
+        # qa_dataset = load_qa(dataset, human_generated_eval=split == 'human_generated_eval')
+        # split_idx = qa_dataset.get_idx_split()
+        # all_indices = split_idx[split].tolist()
+
+        # results_list = []
+        # query_ids = []
 
         # Prepare args for each worker
-        args = [(idx, eval_csv, qa_dataset, evaluator, eval_metrics) for idx in all_indices]
+        # args = [(idx, eval_csv, qa_dataset, evaluator, eval_metrics) for idx in all_indices]
 
-        with ProcessPoolExecutor(max_workers=num_workers) as executor:
-            futures = [executor.submit(process_single_instance, arg) for arg in args]
-            for future in tqdm(as_completed(futures), total=len(futures)):
-                result = future.result()  # This will raise an error if the worker encountered one
-                results_list.append(result)
-                query_ids.append(result['query_id'])
+        # with ProcessPoolExecutor(max_workers=num_workers) as executor:
+        #     futures = [executor.submit(process_single_instance, arg) for arg in args]
+        #     for future in tqdm(as_completed(futures), total=len(futures)):
+        #         result = future.result()  # This will raise an error if the worker encountered one
+        #         results_list.append(result)
+        #         query_ids.append(result['query_id'])
 
         # Concatenate results and compute final metrics
-        eval_csv = pd.concat([eval_csv, pd.DataFrame(results_list)], ignore_index=True)
+        # eval_csv = pd.concat([eval_csv, pd.DataFrame(results_list)], ignore_index=True)
         final_results = {
-            metric: np.mean(eval_csv[eval_csv['query_id'].isin(query_ids)][metric]) for metric in eval_metrics
+            # metric: np.mean(eval_csv[eval_csv['query_id'].isin(query_ids)][metric]) for metric in eval_metrics
            'kappa': round(cohen_kappa, 4),
+            'alpha': round(krippendorff_alpha, 4)
         }
         return final_results
 
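
The rewritten compute_metrics no longer ranks candidates; it parses a submission as whitespace-separated `qid Q0 docid score` rows, clips grades to the 0-3 scale, aligns them with the hidden human labels, and reports inter-annotator agreement. A self-contained sketch of that agreement computation on invented toy labels (using an inner join here, where the app merges with how='outer'):

    import pandas as pd
    from sklearn.metrics import cohen_kappa_score
    import krippendorff

    # Invented toy data: human qrels vs. a system's predicted grades (0-3)
    human = pd.DataFrame({'qid': ['q1', 'q1', 'q2'], 'docid': ['d1', 'd2', 'd3'], 'score': [3, 0, 2]})
    system = pd.DataFrame({'qid': ['q1', 'q1', 'q2'], 'docid': ['d1', 'd2', 'd3'], 'score': [2, 0, 2]})

    # Align the two label sets on (qid, docid); the shared 'score' column is
    # suffixed as score_x (human) and score_y (system), as in the diff above
    merged = pd.merge(human, system, on=['qid', 'docid'])

    kappa = cohen_kappa_score(merged['score_x'], merged['score_y'])
    alpha = krippendorff.alpha(
        reliability_data=[merged['score_x'].tolist(), merged['score_y'].tolist()],
        value_domain=[0, 1, 2, 3],
        level_of_measurement='ordinal',
    )
    print(round(kappa, 4), round(alpha, 4))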
@@ -110,60 +125,65 @@ def compute_metrics(csv_path: str, dataset: str, split: str, num_workers: int =
     except Exception as error:
         return f"{error}"
 
-
 # Data dictionaries for leaderboard
 data_synthesized_full = {
-    'Method': ['BM25', 'DPR (roberta)', 'ANCE (roberta)', 'QAGNN (roberta)', 'ada-002', 'voyage-l2-instruct', 'LLM2Vec', 'GritLM-7b', 'multi-ada-002', 'ColBERTv2'],
-    'STARK-AMAZON_Hit@1': [44.94, 15.29, 30.96, 26.56, 39.16, 40.93, 21.74, 42.08, 40.07, 46.10],
-    'STARK-AMAZON_Hit@5': [67.42, 47.93, 51.06, 50.01, 62.73, 64.37, 41.65, 66.87, 64.98, 66.02],
-    'STARK-AMAZON_R@20': [53.77, 44.49, 41.95, 52.05, 53.29, 54.28, 33.22, 56.52, 55.12, 53.44],
-    'STARK-AMAZON_MRR': [55.30, 30.20, 40.66, 37.75, 50.35, 51.60, 31.47, 53.46, 51.55, 55.51],
-    'STARK-MAG_Hit@1': [25.85, 10.51, 21.96, 12.88, 29.08, 30.06, 18.01, 37.90, 25.92, 31.18],
-    'STARK-MAG_Hit@5': [45.25, 35.23, 36.50, 39.01, 49.61, 50.58, 34.85, 56.74, 50.43, 46.42],
-    'STARK-MAG_R@20': [45.69, 42.11, 35.32, 46.97, 48.36, 50.49, 35.46, 46.40, 50.80, 43.94],
-    'STARK-MAG_MRR': [34.91, 21.34, 29.14, 29.12, 38.62, 39.66, 26.10, 47.25, 36.94, 38.39],
-    'STARK-PRIME_Hit@1': [12.75, 4.46, 6.53, 8.85, 12.63, 10.85, 10.10, 15.57, 15.10, 11.75],
-    'STARK-PRIME_Hit@5': [27.92, 21.85, 15.67, 21.35, 31.49, 30.23, 22.49, 33.42, 33.56, 23.85],
-    'STARK-PRIME_R@20': [31.25, 30.13, 16.52, 29.63, 36.00, 37.83, 26.34, 39.09, 38.05, 25.04],
-    'STARK-PRIME_MRR': [19.84, 12.38, 11.05, 14.73, 21.41, 19.99, 16.12, 24.11, 23.49, 17.39]
+    'Method': ['BM25', 'DPR (roberta)', 'ANCE (roberta)'],
+    'LLMJudge-DL2023_Kappa': [44.94, 15.29, 30.96],
+    'LLMJudge-DL2023_Alpha': [67.42, 47.93, 51.06],
 }
 
-data_synthesized_10 = {
-    'Method': ['BM25', 'DPR (roberta)', 'ANCE (roberta)', 'QAGNN (roberta)', 'ada-002', 'voyage-l2-instruct', 'LLM2Vec', 'GritLM-7b', 'multi-ada-002', 'ColBERTv2', 'Claude3 Reranker', 'GPT4 Reranker'],
-    'STARK-AMAZON_Hit@1': [42.68, 16.46, 30.09, 25.00, 39.02, 43.29, 18.90, 43.29, 40.85, 44.31, 45.49, 44.79],
-    'STARK-AMAZON_Hit@5': [67.07, 50.00, 49.27, 48.17, 64.02, 67.68, 37.80, 71.34, 62.80, 65.24, 71.13, 71.17],
-    'STARK-AMAZON_R@20': [54.48, 42.15, 41.91, 51.65, 49.30, 56.04, 34.73, 56.14, 52.47, 51.00, 53.77, 55.35],
-    'STARK-AMAZON_MRR': [54.02, 30.20, 39.30, 36.87, 50.32, 54.20, 28.76, 55.07, 51.54, 55.07, 55.91, 55.69],
-    'STARK-MAG_Hit@1': [27.81, 11.65, 22.89, 12.03, 28.20, 34.59, 19.17, 38.35, 25.56, 31.58, 36.54, 40.90],
-    'STARK-MAG_Hit@5': [45.48, 36.84, 37.26, 37.97, 52.63, 50.75, 33.46, 58.64, 50.37, 47.36, 53.17, 58.18],
-    'STARK-MAG_R@20': [44.59, 42.30, 44.16, 47.98, 49.25, 50.75, 29.85, 46.38, 53.03, 45.72, 48.36, 48.60],
-    'STARK-MAG_MRR': [35.97, 21.82, 30.00, 28.70, 38.55, 42.90, 26.06, 48.25, 36.82, 38.98, 44.15, 49.00],
-    'STARK-PRIME_Hit@1': [13.93, 5.00, 6.78, 7.14, 15.36, 12.14, 9.29, 16.79, 15.36, 15.00, 17.79, 18.28],
-    'STARK-PRIME_Hit@5': [31.07, 23.57, 16.15, 17.14, 31.07, 31.42, 20.7, 34.29, 32.86, 26.07, 36.90, 37.28],
-    'STARK-PRIME_R@20': [32.84, 30.50, 17.07, 32.95, 37.88, 37.34, 25.54, 41.11, 40.99, 27.78, 35.57, 34.05],
-    'STARK-PRIME_MRR': [21.68, 13.50, 11.42, 16.27, 23.50, 21.23, 15.00, 24.99, 23.70, 19.98, 26.27, 26.55]
-}
-
-data_human_generated = {
-    'Method': ['BM25', 'DPR (roberta)', 'ANCE (roberta)', 'QAGNN (roberta)', 'ada-002', 'voyage-l2-instruct', 'LLM2Vec', 'GritLM-7b', 'multi-ada-002', 'ColBERTv2', 'Claude3 Reranker', 'GPT4 Reranker'],
-    'STARK-AMAZON_Hit@1': [27.16, 16.05, 25.93, 22.22, 39.50, 35.80, 29.63, 40.74, 46.91, 33.33, 53.09, 50.62],
-    'STARK-AMAZON_Hit@5': [51.85, 39.51, 54.32, 49.38, 64.19, 62.96, 46.91, 71.60, 72.84, 55.56, 74.07, 75.31],
-    'STARK-AMAZON_R@20': [29.23, 15.23, 23.69, 21.54, 35.46, 33.01, 21.21, 36.30, 40.22, 29.03, 35.46, 35.46],
-    'STARK-AMAZON_MRR': [18.79, 27.21, 37.12, 31.33, 52.65, 47.84, 38.61, 53.21, 58.74, 43.77, 62.11, 61.06],
-    'STARK-MAG_Hit@1': [32.14, 4.72, 25.00, 20.24, 28.57, 22.62, 16.67, 34.52, 23.81, 33.33, 38.10, 36.90],
-    'STARK-MAG_Hit@5': [41.67, 9.52, 30.95, 26.19, 41.67, 36.90, 28.57, 44.04, 41.67, 36.90, 45.24, 46.43],
-    'STARK-MAG_R@20': [32.46, 25.00, 27.24, 28.76, 35.95, 32.44, 21.74, 34.57, 39.85, 30.50, 35.95, 35.95],
-    'STARK-MAG_MRR': [37.42, 7.90, 27.98, 25.53, 35.81, 29.68, 21.59, 38.72, 31.43, 35.97, 42.00, 40.65],
-    'STARK-PRIME_Hit@1': [22.45, 2.04, 7.14, 6.12, 17.35, 16.33, 9.18, 25.51, 24.49, 15.31, 28.57, 28.57],
-    'STARK-PRIME_Hit@5': [41.84, 9.18, 13.27, 13.27, 34.69, 32.65, 21.43, 41.84, 39.80, 26.53, 46.94, 44.90],
-    'STARK-PRIME_R@20': [42.32, 10.69, 11.72, 17.62, 41.09, 39.01, 26.77, 48.10, 47.21, 25.56, 41.61, 41.61],
-    'STARK-PRIME_MRR': [30.37, 7.05, 10.07, 9.39, 26.35, 24.33, 15.24, 34.28, 32.98, 19.67, 36.32, 34.82]
-}
+# data_synthesized_full = {
+#     'Method': ['BM25', 'DPR (roberta)', 'ANCE (roberta)', 'QAGNN (roberta)', 'ada-002', 'voyage-l2-instruct', 'LLM2Vec', 'GritLM-7b', 'multi-ada-002', 'ColBERTv2'],
+#     'STARK-AMAZON_Hit@1': [44.94, 15.29, 30.96, 26.56, 39.16, 40.93, 21.74, 42.08, 40.07, 46.10],
+#     'STARK-AMAZON_Hit@5': [67.42, 47.93, 51.06, 50.01, 62.73, 64.37, 41.65, 66.87, 64.98, 66.02],
+#     'STARK-AMAZON_R@20': [53.77, 44.49, 41.95, 52.05, 53.29, 54.28, 33.22, 56.52, 55.12, 53.44],
+#     'STARK-AMAZON_MRR': [55.30, 30.20, 40.66, 37.75, 50.35, 51.60, 31.47, 53.46, 51.55, 55.51],
+#     'STARK-MAG_Hit@1': [25.85, 10.51, 21.96, 12.88, 29.08, 30.06, 18.01, 37.90, 25.92, 31.18],
+#     'STARK-MAG_Hit@5': [45.25, 35.23, 36.50, 39.01, 49.61, 50.58, 34.85, 56.74, 50.43, 46.42],
+#     'STARK-MAG_R@20': [45.69, 42.11, 35.32, 46.97, 48.36, 50.49, 35.46, 46.40, 50.80, 43.94],
+#     'STARK-MAG_MRR': [34.91, 21.34, 29.14, 29.12, 38.62, 39.66, 26.10, 47.25, 36.94, 38.39],
+#     'STARK-PRIME_Hit@1': [12.75, 4.46, 6.53, 8.85, 12.63, 10.85, 10.10, 15.57, 15.10, 11.75],
+#     'STARK-PRIME_Hit@5': [27.92, 21.85, 15.67, 21.35, 31.49, 30.23, 22.49, 33.42, 33.56, 23.85],
+#     'STARK-PRIME_R@20': [31.25, 30.13, 16.52, 29.63, 36.00, 37.83, 26.34, 39.09, 38.05, 25.04],
+#     'STARK-PRIME_MRR': [19.84, 12.38, 11.05, 14.73, 21.41, 19.99, 16.12, 24.11, 23.49, 17.39]
+# }
+
+# data_synthesized_10 = {
+#     'Method': ['BM25', 'DPR (roberta)', 'ANCE (roberta)', 'QAGNN (roberta)', 'ada-002', 'voyage-l2-instruct', 'LLM2Vec', 'GritLM-7b', 'multi-ada-002', 'ColBERTv2', 'Claude3 Reranker', 'GPT4 Reranker'],
+#     'STARK-AMAZON_Hit@1': [42.68, 16.46, 30.09, 25.00, 39.02, 43.29, 18.90, 43.29, 40.85, 44.31, 45.49, 44.79],
+#     'STARK-AMAZON_Hit@5': [67.07, 50.00, 49.27, 48.17, 64.02, 67.68, 37.80, 71.34, 62.80, 65.24, 71.13, 71.17],
+#     'STARK-AMAZON_R@20': [54.48, 42.15, 41.91, 51.65, 49.30, 56.04, 34.73, 56.14, 52.47, 51.00, 53.77, 55.35],
+#     'STARK-AMAZON_MRR': [54.02, 30.20, 39.30, 36.87, 50.32, 54.20, 28.76, 55.07, 51.54, 55.07, 55.91, 55.69],
+#     'STARK-MAG_Hit@1': [27.81, 11.65, 22.89, 12.03, 28.20, 34.59, 19.17, 38.35, 25.56, 31.58, 36.54, 40.90],
+#     'STARK-MAG_Hit@5': [45.48, 36.84, 37.26, 37.97, 52.63, 50.75, 33.46, 58.64, 50.37, 47.36, 53.17, 58.18],
+#     'STARK-MAG_R@20': [44.59, 42.30, 44.16, 47.98, 49.25, 50.75, 29.85, 46.38, 53.03, 45.72, 48.36, 48.60],
+#     'STARK-MAG_MRR': [35.97, 21.82, 30.00, 28.70, 38.55, 42.90, 26.06, 48.25, 36.82, 38.98, 44.15, 49.00],
+#     'STARK-PRIME_Hit@1': [13.93, 5.00, 6.78, 7.14, 15.36, 12.14, 9.29, 16.79, 15.36, 15.00, 17.79, 18.28],
+#     'STARK-PRIME_Hit@5': [31.07, 23.57, 16.15, 17.14, 31.07, 31.42, 20.7, 34.29, 32.86, 26.07, 36.90, 37.28],
+#     'STARK-PRIME_R@20': [32.84, 30.50, 17.07, 32.95, 37.88, 37.34, 25.54, 41.11, 40.99, 27.78, 35.57, 34.05],
+#     'STARK-PRIME_MRR': [21.68, 13.50, 11.42, 16.27, 23.50, 21.23, 15.00, 24.99, 23.70, 19.98, 26.27, 26.55]
+# }
+
+# data_human_generated = {
+#     'Method': ['BM25', 'DPR (roberta)', 'ANCE (roberta)', 'QAGNN (roberta)', 'ada-002', 'voyage-l2-instruct', 'LLM2Vec', 'GritLM-7b', 'multi-ada-002', 'ColBERTv2', 'Claude3 Reranker', 'GPT4 Reranker'],
+#     'STARK-AMAZON_Hit@1': [27.16, 16.05, 25.93, 22.22, 39.50, 35.80, 29.63, 40.74, 46.91, 33.33, 53.09, 50.62],
+#     'STARK-AMAZON_Hit@5': [51.85, 39.51, 54.32, 49.38, 64.19, 62.96, 46.91, 71.60, 72.84, 55.56, 74.07, 75.31],
+#     'STARK-AMAZON_R@20': [29.23, 15.23, 23.69, 21.54, 35.46, 33.01, 21.21, 36.30, 40.22, 29.03, 35.46, 35.46],
+#     'STARK-AMAZON_MRR': [18.79, 27.21, 37.12, 31.33, 52.65, 47.84, 38.61, 53.21, 58.74, 43.77, 62.11, 61.06],
+#     'STARK-MAG_Hit@1': [32.14, 4.72, 25.00, 20.24, 28.57, 22.62, 16.67, 34.52, 23.81, 33.33, 38.10, 36.90],
+#     'STARK-MAG_Hit@5': [41.67, 9.52, 30.95, 26.19, 41.67, 36.90, 28.57, 44.04, 41.67, 36.90, 45.24, 46.43],
+#     'STARK-MAG_R@20': [32.46, 25.00, 27.24, 28.76, 35.95, 32.44, 21.74, 34.57, 39.85, 30.50, 35.95, 35.95],
+#     'STARK-MAG_MRR': [37.42, 7.90, 27.98, 25.53, 35.81, 29.68, 21.59, 38.72, 31.43, 35.97, 42.00, 40.65],
+#     'STARK-PRIME_Hit@1': [22.45, 2.04, 7.14, 6.12, 17.35, 16.33, 9.18, 25.51, 24.49, 15.31, 28.57, 28.57],
+#     'STARK-PRIME_Hit@5': [41.84, 9.18, 13.27, 13.27, 34.69, 32.65, 21.43, 41.84, 39.80, 26.53, 46.94, 44.90],
+#     'STARK-PRIME_R@20': [42.32, 10.69, 11.72, 17.62, 41.09, 39.01, 26.77, 48.10, 47.21, 25.56, 41.61, 41.61],
+#     'STARK-PRIME_MRR': [30.37, 7.05, 10.07, 9.39, 26.35, 24.33, 15.24, 34.28, 32.98, 19.67, 36.32, 34.82]
+# }
 
 # Initialize DataFrames
 df_synthesized_full = pd.DataFrame(data_synthesized_full)
-df_synthesized_10 = pd.DataFrame(data_synthesized_10)
-df_human_generated = pd.DataFrame(data_human_generated)
+# df_synthesized_10 = pd.DataFrame(data_synthesized_10)
+# df_human_generated = pd.DataFrame(data_human_generated)
 
 # Model type definitions
 model_types = {
@@ -347,13 +367,14 @@ def initialize_leaderboard():
     """
     Initialize the leaderboard with baseline results and submitted results.
     """
-    global df_synthesized_full, df_synthesized_10, df_human_generated
+    # global df_synthesized_full, df_synthesized_10, df_human_generated
+    global df_synthesized_full
 
     try:
         # First, initialize with baseline results
         df_synthesized_full = pd.DataFrame(data_synthesized_full)
-        df_synthesized_10 = pd.DataFrame(data_synthesized_10)
-        df_human_generated = pd.DataFrame(data_human_generated)
+        # df_synthesized_10 = pd.DataFrame(data_synthesized_10)
+        # df_human_generated = pd.DataFrame(data_human_generated)
 
         print("Initialized with baseline results")
 
@@ -766,7 +787,6 @@ def process_submission(method_name, team_name, dataset, split, contact_email, co
     except Exception as e:
         print(f"Warning: Failed to delete temporary file {temp_file}: {str(e)}")
 
-
 def filter_by_model_type(df, selected_types):
     """
     Filter DataFrame by selected model types, including submitted models.
@@ -795,19 +815,31 @@ def update_tables(selected_types):
     Include all models from selected categories.
     """
     if not selected_types:
-        return [df.head(0) for df in [df_synthesized_full, df_synthesized_10, df_human_generated]]
+        # return [df.head(0) for df in [df_synthesized_full, df_synthesized_10, df_human_generated]]
+        return [df.head(0) for df in [df_synthesized_full]]
 
     filtered_df_full = filter_by_model_type(df_synthesized_full, selected_types)
-    filtered_df_10 = filter_by_model_type(df_synthesized_10, selected_types)
-    filtered_df_human = filter_by_model_type(df_human_generated, selected_types)
+    # filtered_df_10 = filter_by_model_type(df_synthesized_10, selected_types)
+    # filtered_df_human = filter_by_model_type(df_human_generated, selected_types)
 
     outputs = []
-    for df in [filtered_df_full, filtered_df_10, filtered_df_human]:
-        for dataset in ['AMAZON', 'MAG', 'PRIME']:
-            outputs.append(format_dataframe(df, f"STARK-{dataset}"))
+    # for df in [filtered_df_full, filtered_df_10, filtered_df_human]:
+    for df in [filtered_df_full]:
+        for dataset in ['DL2023', 'MAG', 'PRIME']:
+            outputs.append(format_dataframe(df, f"LLMJudge-{dataset}"))
 
     return outputs
 
+def load_test_data():
+    # Split the text into a list
+    test_data = os.getenv('LLMJudgeTest').split()
+    # Reshape the list into a 2D array where each row contains 4 elements
+    test_data = [test_data[i:i+4] for i in range(0, len(test_data), 4)]
+    # Create a DataFrame
+    test_data = pd.DataFrame(test_data, columns=['qid', 'Q0', 'pid', 'score'])
+
+    return test_data
+
 css = """
 table > thead {
     white-space: normal
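
load_test_data keeps the gold labels out of the repo by reading them from the LLMJudgeTest Space secret: one whitespace-separated string that is regrouped into 4-token rows. The same transform on an inline sample (the rows are invented):

    import pandas as pd

    # Stand-in for os.getenv('LLMJudgeTest'): a flat, whitespace-separated string
    raw = "q1 Q0 doc-a 3 q1 Q0 doc-b 0 q2 Q0 doc-c 2"
    tokens = raw.split()
    # Group every 4 tokens into one (qid, Q0, pid, score) row
    rows = [tokens[i:i + 4] for i in range(0, len(tokens), 4)]
    df = pd.DataFrame(rows, columns=['qid', 'Q0', 'pid', 'score'])
    print(df)

Note that every column, including score, comes out as strings here, since all values originate from a single split string.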
@@ -829,8 +861,8 @@ table > tbody > tr > td:nth-child(2) > div {
 
 # Main application
 with gr.Blocks(css=css) as demo:
-    gr.Markdown("#
-    gr.Markdown("Refer to the [
+    gr.Markdown("# LLM-as-a-Rel: Automatic Relevance Judgment Leaderboard")
+    gr.Markdown("Refer to the [LLMJudge overview paper](https://arxiv.org/pdf/2408.08896) for details on metrics, tasks and models.")
 
     # Initialize leaderboard at startup
     print("Starting leaderboard initialization...")
@@ -870,20 +902,14 @@ with gr.Blocks(css=css) as demo:
 
     # Submission section
 
-    # Split the text into a list
-    test_data = os.getenv('LLMJudgeTest').split()
-    # Reshape the list into a 2D array where each row contains 4 elements
-    test_data = [test_data[i:i+4] for i in range(0, len(test_data), 4)]
-    # Create a DataFrame
-    df = pd.DataFrame(test_data, columns=['qid', 'Q0', 'pid', 'score'])
-    # Display the DataFrame
-    print(df)
+    # load test data
+    test_data = load_test_data()
 
     gr.Markdown("---")
     gr.Markdown("## Submit Your Results:")
     gr.Markdown("""
     Submit your results to be included in the leaderboard. Please ensure your submission meets all requirements.
-    For questions, contact
+    For questions, contact hossein.rahmani.22@ucl.ac.uk. Detailed instructions are available in the [submission instructions](https://docs.google.com/document/d/11coGjTmOEi9p9-PUq1oy0eTOj8f_8CVQhDl5_0FKT14/edit?usp=sharing).
     """)
 
     with gr.Row():
@@ -933,8 +959,8 @@ with gr.Blocks(css=css) as demo:
                     placeholder="e.g., 4x NVIDIA A100 80GB"
                 )
                 csv_file = gr.File(
-                    label="Prediction CSV*",
-                    file_types=[".csv"],
+                    label="Prediction TXT*",
+                    file_types=[".txt"],
                     type="filepath"
                 )
                 paper_link = gr.Textbox(
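
The upload widget now expects a plain-text run file: one `qid Q0 docid score` line per judged query-document pair, matching the sep=" " parse in compute_metrics. A hypothetical way to produce one:

    # Hypothetical predicted grades: (qid, docid, relevance on the 0-3 scale)
    preds = [("q1", "doc-a", 3), ("q1", "doc-b", 1), ("q2", "doc-c", 0)]

    with open("run.txt", "w") as f:
        for qid, docid, score in preds:
            f.write(f"{qid} Q0 {docid} {score}\n")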
@@ -945,7 +971,6 @@ with gr.Blocks(css=css) as demo:
                 submit_btn = gr.Button("Submit", variant="primary")
                 result = gr.Textbox(label="Submission Status", interactive=False)
 
-
     # Set up event handlers
     model_type_filter.change(
         update_tables,
@@ -973,7 +998,6 @@ with gr.Blocks(css=css) as demo:
         inputs=[model_type_filter],
         outputs=all_dfs
     )
-
 
 # Launch the application
 demo.launch()