rahmanidashti committed
Commit 43dd1e4 · 1 Parent(s): 0d282c0

upgrade leaderboard

Files changed (1)
  1. app.py +128 -104
app.py CHANGED
@@ -14,6 +14,8 @@ from email.mime.text import MIMEText
from huggingface_hub import HfApi
import shutil
import tempfile
+ from sklearn.metrics import cohen_kappa_score
+ import krippendorff

from stark_qa import load_qa
from stark_qa.evaluator import Evaluator
@@ -23,7 +25,7 @@ from utils.token_handler import TokenHandler

# Initialize storage once at startup
try:
- REPO_ID = "snap-stanford/stark-leaderboard" # Replace with your space name
+ REPO_ID = "rahmanidashti/llm-as-a-rel" # Replace with your space name
hub_storage = HubStorage(REPO_ID)
except Exception as e:
raise RuntimeError(f"Failed to initialize HuggingFace Hub storage: {e}")
@@ -56,50 +58,63 @@ def process_single_instance(args):
result["idx"], result["query_id"] = idx, query_id
return result

-
def compute_metrics(csv_path: str, dataset: str, split: str, num_workers: int = 4):
+ """
+ computing the metrics for the evaluation.
+
+ Parameters:
+ csv_path (str): The path to the submission file for evaluation.
+ """
candidate_ids_dict = {
'amazon': [i for i in range(957192)],
'mag': [i for i in range(1172724, 1872968)],
'prime': [i for i in range(129375)]
}
try:
- eval_csv = pd.read_csv(csv_path)
- if 'query_id' not in eval_csv.columns:
- raise ValueError('No `query_id` column found in the submitted csv.')
- if 'pred_rank' not in eval_csv.columns:
- raise ValueError('No `pred_rank` column found in the submitted csv.')
-
- eval_csv = eval_csv[['query_id', 'pred_rank']]
-
- if dataset not in candidate_ids_dict:
- raise ValueError(f"Invalid dataset '{dataset}', expected one of {list(candidate_ids_dict.keys())}.")
- if split not in ['test', 'test-0.1', 'human_generated_eval']:
- raise ValueError(f"Invalid split '{split}', expected one of ['test', 'test-0.1', 'human_generated_eval'].")
-
- evaluator = Evaluator(candidate_ids_dict[dataset])
- eval_metrics = ['hit@1', 'hit@5', 'recall@20', 'mrr']
- qa_dataset = load_qa(dataset, human_generated_eval=split == 'human_generated_eval')
- split_idx = qa_dataset.get_idx_split()
- all_indices = split_idx[split].tolist()
-
- results_list = []
- query_ids = []
+ # eval_csv = pd.read_csv(csv_path)
+ eval_csv = pd.read_csv(csv_path, sep=" ", header=None, names=['qid', 'Q0', 'docid', 'score'])
+ eval_csv['score'] = [0 if x < 0 else 3 if x > 3 else x for x in eval_csv['score']]
+ test_eval_df = pd.merge(test_data, eval_csv, on=['qid', 'docid'], how='outer')
+ cohen_kappa = cohen_kappa_score(test_eval_df['score_x'], test_eval_df['score_y'])
+ krippendorff_alpha = krippendorff.alpha(reliability_data=[test_eval_df['score_x'], test_eval_df['score_y']], value_domain=[0,1,2,3], level_of_measurement='ordinal')
+
+ # if 'query_id' not in eval_csv.columns:
+ # raise ValueError('No `query_id` column found in the submitted csv.')
+ # if 'pred_rank' not in eval_csv.columns:
+ # raise ValueError('No `pred_rank` column found in the submitted csv.')
+
+ # eval_csv = eval_csv[['query_id', 'pred_rank']]
+
+ # if dataset not in candidate_ids_dict:
+ # raise ValueError(f"Invalid dataset '{dataset}', expected one of {list(candidate_ids_dict.keys())}.")
+ # if split not in ['test', 'test-0.1', 'human_generated_eval']:
+ # raise ValueError(f"Invalid split '{split}', expected one of ['test', 'test-0.1', 'human_generated_eval'].")
+
+ # evaluator = Evaluator(candidate_ids_dict[dataset])
+ # eval_metrics = ['hit@1', 'hit@5', 'recall@20', 'mrr']
+ # qa_dataset = load_qa(dataset, human_generated_eval=split == 'human_generated_eval')
+ # split_idx = qa_dataset.get_idx_split()
+ # all_indices = split_idx[split].tolist()
+
+ # results_list = []
+ # query_ids = []

# Prepare args for each worker
- args = [(idx, eval_csv, qa_dataset, evaluator, eval_metrics) for idx in all_indices]
+ # args = [(idx, eval_csv, qa_dataset, evaluator, eval_metrics) for idx in all_indices]

- with ProcessPoolExecutor(max_workers=num_workers) as executor:
- futures = [executor.submit(process_single_instance, arg) for arg in args]
- for future in tqdm(as_completed(futures), total=len(futures)):
- result = future.result() # This will raise an error if the worker encountered one
- results_list.append(result)
- query_ids.append(result['query_id'])
+ # with ProcessPoolExecutor(max_workers=num_workers) as executor:
+ # futures = [executor.submit(process_single_instance, arg) for arg in args]
+ # for future in tqdm(as_completed(futures), total=len(futures)):
+ # result = future.result() # This will raise an error if the worker encountered one
+ # results_list.append(result)
+ # query_ids.append(result['query_id'])

# Concatenate results and compute final metrics
- eval_csv = pd.concat([eval_csv, pd.DataFrame(results_list)], ignore_index=True)
+ # eval_csv = pd.concat([eval_csv, pd.DataFrame(results_list)], ignore_index=True)
final_results = {
- metric: np.mean(eval_csv[eval_csv['query_id'].isin(query_ids)][metric]) for metric in eval_metrics
+ # metric: np.mean(eval_csv[eval_csv['query_id'].isin(query_ids)][metric]) for metric in eval_metrics
+ 'kappa': round(cohen_kappa, 4),
+ 'alpha': round(krippendorff_alpha, 4)
}
return final_results

@@ -110,60 +125,65 @@ def compute_metrics(csv_path: str, dataset: str, split: str, num_workers: int =
except Exception as error:
return f"{error}"

-
# Data dictionaries for leaderboard
data_synthesized_full = {
- 'Method': ['BM25', 'DPR (roberta)', 'ANCE (roberta)', 'QAGNN (roberta)', 'ada-002', 'voyage-l2-instruct', 'LLM2Vec', 'GritLM-7b', 'multi-ada-002', 'ColBERTv2'],
- 'STARK-AMAZON_Hit@1': [44.94, 15.29, 30.96, 26.56, 39.16, 40.93, 21.74, 42.08, 40.07, 46.10],
- 'STARK-AMAZON_Hit@5': [67.42, 47.93, 51.06, 50.01, 62.73, 64.37, 41.65, 66.87, 64.98, 66.02],
- 'STARK-AMAZON_R@20': [53.77, 44.49, 41.95, 52.05, 53.29, 54.28, 33.22, 56.52, 55.12, 53.44],
- 'STARK-AMAZON_MRR': [55.30, 30.20, 40.66, 37.75, 50.35, 51.60, 31.47, 53.46, 51.55, 55.51],
- 'STARK-MAG_Hit@1': [25.85, 10.51, 21.96, 12.88, 29.08, 30.06, 18.01, 37.90, 25.92, 31.18],
- 'STARK-MAG_Hit@5': [45.25, 35.23, 36.50, 39.01, 49.61, 50.58, 34.85, 56.74, 50.43, 46.42],
- 'STARK-MAG_R@20': [45.69, 42.11, 35.32, 46.97, 48.36, 50.49, 35.46, 46.40, 50.80, 43.94],
- 'STARK-MAG_MRR': [34.91, 21.34, 29.14, 29.12, 38.62, 39.66, 26.10, 47.25, 36.94, 38.39],
- 'STARK-PRIME_Hit@1': [12.75, 4.46, 6.53, 8.85, 12.63, 10.85, 10.10, 15.57, 15.10, 11.75],
- 'STARK-PRIME_Hit@5': [27.92, 21.85, 15.67, 21.35, 31.49, 30.23, 22.49, 33.42, 33.56, 23.85],
- 'STARK-PRIME_R@20': [31.25, 30.13, 16.52, 29.63, 36.00, 37.83, 26.34, 39.09, 38.05, 25.04],
- 'STARK-PRIME_MRR': [19.84, 12.38, 11.05, 14.73, 21.41, 19.99, 16.12, 24.11, 23.49, 17.39]
+ 'Method': ['BM25', 'DPR (roberta)', 'ANCE (roberta)'],
+ 'LLMJudge-DL2023_Kappa': [44.94, 15.29, 30.96],
+ 'LLMJudge-DL2023_Alpha': [67.42, 47.93, 51.06],
}

- data_synthesized_10 = {
- 'Method': ['BM25', 'DPR (roberta)', 'ANCE (roberta)', 'QAGNN (roberta)', 'ada-002', 'voyage-l2-instruct', 'LLM2Vec', 'GritLM-7b', 'multi-ada-002', 'ColBERTv2', 'Claude3 Reranker', 'GPT4 Reranker'],
- 'STARK-AMAZON_Hit@1': [42.68, 16.46, 30.09, 25.00, 39.02, 43.29, 18.90, 43.29, 40.85, 44.31, 45.49, 44.79],
- 'STARK-AMAZON_Hit@5': [67.07, 50.00, 49.27, 48.17, 64.02, 67.68, 37.80, 71.34, 62.80, 65.24, 71.13, 71.17],
- 'STARK-AMAZON_R@20': [54.48, 42.15, 41.91, 51.65, 49.30, 56.04, 34.73, 56.14, 52.47, 51.00, 53.77, 55.35],
- 'STARK-AMAZON_MRR': [54.02, 30.20, 39.30, 36.87, 50.32, 54.20, 28.76, 55.07, 51.54, 55.07, 55.91, 55.69],
- 'STARK-MAG_Hit@1': [27.81, 11.65, 22.89, 12.03, 28.20, 34.59, 19.17, 38.35, 25.56, 31.58, 36.54, 40.90],
- 'STARK-MAG_Hit@5': [45.48, 36.84, 37.26, 37.97, 52.63, 50.75, 33.46, 58.64, 50.37, 47.36, 53.17, 58.18],
- 'STARK-MAG_R@20': [44.59, 42.30, 44.16, 47.98, 49.25, 50.75, 29.85, 46.38, 53.03, 45.72, 48.36, 48.60],
- 'STARK-MAG_MRR': [35.97, 21.82, 30.00, 28.70, 38.55, 42.90, 26.06, 48.25, 36.82, 38.98, 44.15, 49.00],
- 'STARK-PRIME_Hit@1': [13.93, 5.00, 6.78, 7.14, 15.36, 12.14, 9.29, 16.79, 15.36, 15.00, 17.79, 18.28],
- 'STARK-PRIME_Hit@5': [31.07, 23.57, 16.15, 17.14, 31.07, 31.42, 20.7, 34.29, 32.86, 26.07, 36.90, 37.28],
- 'STARK-PRIME_R@20': [32.84, 30.50, 17.07, 32.95, 37.88, 37.34, 25.54, 41.11, 40.99, 27.78, 35.57, 34.05],
- 'STARK-PRIME_MRR': [21.68, 13.50, 11.42, 16.27, 23.50, 21.23, 15.00, 24.99, 23.70, 19.98, 26.27, 26.55]
- }
-
- data_human_generated = {
- 'Method': ['BM25', 'DPR (roberta)', 'ANCE (roberta)', 'QAGNN (roberta)', 'ada-002', 'voyage-l2-instruct', 'LLM2Vec', 'GritLM-7b', 'multi-ada-002', 'ColBERTv2', 'Claude3 Reranker', 'GPT4 Reranker'],
- 'STARK-AMAZON_Hit@1': [27.16, 16.05, 25.93, 22.22, 39.50, 35.80, 29.63, 40.74, 46.91, 33.33, 53.09, 50.62],
- 'STARK-AMAZON_Hit@5': [51.85, 39.51, 54.32, 49.38, 64.19, 62.96, 46.91, 71.60, 72.84, 55.56, 74.07, 75.31],
- 'STARK-AMAZON_R@20': [29.23, 15.23, 23.69, 21.54, 35.46, 33.01, 21.21, 36.30, 40.22, 29.03, 35.46, 35.46],
- 'STARK-AMAZON_MRR': [18.79, 27.21, 37.12, 31.33, 52.65, 47.84, 38.61, 53.21, 58.74, 43.77, 62.11, 61.06],
- 'STARK-MAG_Hit@1': [32.14, 4.72, 25.00, 20.24, 28.57, 22.62, 16.67, 34.52, 23.81, 33.33, 38.10, 36.90],
- 'STARK-MAG_Hit@5': [41.67, 9.52, 30.95, 26.19, 41.67, 36.90, 28.57, 44.04, 41.67, 36.90, 45.24, 46.43],
- 'STARK-MAG_R@20': [32.46, 25.00, 27.24, 28.76, 35.95, 32.44, 21.74, 34.57, 39.85, 30.50, 35.95, 35.95],
- 'STARK-MAG_MRR': [37.42, 7.90, 27.98, 25.53, 35.81, 29.68, 21.59, 38.72, 31.43, 35.97, 42.00, 40.65],
- 'STARK-PRIME_Hit@1': [22.45, 2.04, 7.14, 6.12, 17.35, 16.33, 9.18, 25.51, 24.49, 15.31, 28.57, 28.57],
- 'STARK-PRIME_Hit@5': [41.84, 9.18, 13.27, 13.27, 34.69, 32.65, 21.43, 41.84, 39.80, 26.53, 46.94, 44.90],
- 'STARK-PRIME_R@20': [42.32, 10.69, 11.72, 17.62, 41.09, 39.01, 26.77, 48.10, 47.21, 25.56, 41.61, 41.61],
- 'STARK-PRIME_MRR': [30.37, 7.05, 10.07, 9.39, 26.35, 24.33, 15.24, 34.28, 32.98, 19.67, 36.32, 34.82]
- }
+ # data_synthesized_full = {
+ # 'Method': ['BM25', 'DPR (roberta)', 'ANCE (roberta)', 'QAGNN (roberta)', 'ada-002', 'voyage-l2-instruct', 'LLM2Vec', 'GritLM-7b', 'multi-ada-002', 'ColBERTv2'],
+ # 'STARK-AMAZON_Hit@1': [44.94, 15.29, 30.96, 26.56, 39.16, 40.93, 21.74, 42.08, 40.07, 46.10],
+ # 'STARK-AMAZON_Hit@5': [67.42, 47.93, 51.06, 50.01, 62.73, 64.37, 41.65, 66.87, 64.98, 66.02],
+ # 'STARK-AMAZON_R@20': [53.77, 44.49, 41.95, 52.05, 53.29, 54.28, 33.22, 56.52, 55.12, 53.44],
+ # 'STARK-AMAZON_MRR': [55.30, 30.20, 40.66, 37.75, 50.35, 51.60, 31.47, 53.46, 51.55, 55.51],
+ # 'STARK-MAG_Hit@1': [25.85, 10.51, 21.96, 12.88, 29.08, 30.06, 18.01, 37.90, 25.92, 31.18],
+ # 'STARK-MAG_Hit@5': [45.25, 35.23, 36.50, 39.01, 49.61, 50.58, 34.85, 56.74, 50.43, 46.42],
+ # 'STARK-MAG_R@20': [45.69, 42.11, 35.32, 46.97, 48.36, 50.49, 35.46, 46.40, 50.80, 43.94],
+ # 'STARK-MAG_MRR': [34.91, 21.34, 29.14, 29.12, 38.62, 39.66, 26.10, 47.25, 36.94, 38.39],
+ # 'STARK-PRIME_Hit@1': [12.75, 4.46, 6.53, 8.85, 12.63, 10.85, 10.10, 15.57, 15.10, 11.75],
+ # 'STARK-PRIME_Hit@5': [27.92, 21.85, 15.67, 21.35, 31.49, 30.23, 22.49, 33.42, 33.56, 23.85],
+ # 'STARK-PRIME_R@20': [31.25, 30.13, 16.52, 29.63, 36.00, 37.83, 26.34, 39.09, 38.05, 25.04],
+ # 'STARK-PRIME_MRR': [19.84, 12.38, 11.05, 14.73, 21.41, 19.99, 16.12, 24.11, 23.49, 17.39]
+ # }
+
+ # data_synthesized_10 = {
+ # 'Method': ['BM25', 'DPR (roberta)', 'ANCE (roberta)', 'QAGNN (roberta)', 'ada-002', 'voyage-l2-instruct', 'LLM2Vec', 'GritLM-7b', 'multi-ada-002', 'ColBERTv2', 'Claude3 Reranker', 'GPT4 Reranker'],
+ # 'STARK-AMAZON_Hit@1': [42.68, 16.46, 30.09, 25.00, 39.02, 43.29, 18.90, 43.29, 40.85, 44.31, 45.49, 44.79],
+ # 'STARK-AMAZON_Hit@5': [67.07, 50.00, 49.27, 48.17, 64.02, 67.68, 37.80, 71.34, 62.80, 65.24, 71.13, 71.17],
+ # 'STARK-AMAZON_R@20': [54.48, 42.15, 41.91, 51.65, 49.30, 56.04, 34.73, 56.14, 52.47, 51.00, 53.77, 55.35],
+ # 'STARK-AMAZON_MRR': [54.02, 30.20, 39.30, 36.87, 50.32, 54.20, 28.76, 55.07, 51.54, 55.07, 55.91, 55.69],
+ # 'STARK-MAG_Hit@1': [27.81, 11.65, 22.89, 12.03, 28.20, 34.59, 19.17, 38.35, 25.56, 31.58, 36.54, 40.90],
+ # 'STARK-MAG_Hit@5': [45.48, 36.84, 37.26, 37.97, 52.63, 50.75, 33.46, 58.64, 50.37, 47.36, 53.17, 58.18],
+ # 'STARK-MAG_R@20': [44.59, 42.30, 44.16, 47.98, 49.25, 50.75, 29.85, 46.38, 53.03, 45.72, 48.36, 48.60],
+ # 'STARK-MAG_MRR': [35.97, 21.82, 30.00, 28.70, 38.55, 42.90, 26.06, 48.25, 36.82, 38.98, 44.15, 49.00],
+ # 'STARK-PRIME_Hit@1': [13.93, 5.00, 6.78, 7.14, 15.36, 12.14, 9.29, 16.79, 15.36, 15.00, 17.79, 18.28],
+ # 'STARK-PRIME_Hit@5': [31.07, 23.57, 16.15, 17.14, 31.07, 31.42, 20.7, 34.29, 32.86, 26.07, 36.90, 37.28],
+ # 'STARK-PRIME_R@20': [32.84, 30.50, 17.07, 32.95, 37.88, 37.34, 25.54, 41.11, 40.99, 27.78, 35.57, 34.05],
+ # 'STARK-PRIME_MRR': [21.68, 13.50, 11.42, 16.27, 23.50, 21.23, 15.00, 24.99, 23.70, 19.98, 26.27, 26.55]
+ # }
+
+ # data_human_generated = {
+ # 'Method': ['BM25', 'DPR (roberta)', 'ANCE (roberta)', 'QAGNN (roberta)', 'ada-002', 'voyage-l2-instruct', 'LLM2Vec', 'GritLM-7b', 'multi-ada-002', 'ColBERTv2', 'Claude3 Reranker', 'GPT4 Reranker'],
+ # 'STARK-AMAZON_Hit@1': [27.16, 16.05, 25.93, 22.22, 39.50, 35.80, 29.63, 40.74, 46.91, 33.33, 53.09, 50.62],
+ # 'STARK-AMAZON_Hit@5': [51.85, 39.51, 54.32, 49.38, 64.19, 62.96, 46.91, 71.60, 72.84, 55.56, 74.07, 75.31],
+ # 'STARK-AMAZON_R@20': [29.23, 15.23, 23.69, 21.54, 35.46, 33.01, 21.21, 36.30, 40.22, 29.03, 35.46, 35.46],
+ # 'STARK-AMAZON_MRR': [18.79, 27.21, 37.12, 31.33, 52.65, 47.84, 38.61, 53.21, 58.74, 43.77, 62.11, 61.06],
+ # 'STARK-MAG_Hit@1': [32.14, 4.72, 25.00, 20.24, 28.57, 22.62, 16.67, 34.52, 23.81, 33.33, 38.10, 36.90],
+ # 'STARK-MAG_Hit@5': [41.67, 9.52, 30.95, 26.19, 41.67, 36.90, 28.57, 44.04, 41.67, 36.90, 45.24, 46.43],
+ # 'STARK-MAG_R@20': [32.46, 25.00, 27.24, 28.76, 35.95, 32.44, 21.74, 34.57, 39.85, 30.50, 35.95, 35.95],
+ # 'STARK-MAG_MRR': [37.42, 7.90, 27.98, 25.53, 35.81, 29.68, 21.59, 38.72, 31.43, 35.97, 42.00, 40.65],
+ # 'STARK-PRIME_Hit@1': [22.45, 2.04, 7.14, 6.12, 17.35, 16.33, 9.18, 25.51, 24.49, 15.31, 28.57, 28.57],
+ # 'STARK-PRIME_Hit@5': [41.84, 9.18, 13.27, 13.27, 34.69, 32.65, 21.43, 41.84, 39.80, 26.53, 46.94, 44.90],
+ # 'STARK-PRIME_R@20': [42.32, 10.69, 11.72, 17.62, 41.09, 39.01, 26.77, 48.10, 47.21, 25.56, 41.61, 41.61],
+ # 'STARK-PRIME_MRR': [30.37, 7.05, 10.07, 9.39, 26.35, 24.33, 15.24, 34.28, 32.98, 19.67, 36.32, 34.82]
+ # }

# Initialize DataFrames
df_synthesized_full = pd.DataFrame(data_synthesized_full)
- df_synthesized_10 = pd.DataFrame(data_synthesized_10)
- df_human_generated = pd.DataFrame(data_human_generated)
+ # df_synthesized_10 = pd.DataFrame(data_synthesized_10)
+ # df_human_generated = pd.DataFrame(data_human_generated)

# Model type definitions
model_types = {
@@ -347,13 +367,14 @@ def initialize_leaderboard():
"""
Initialize the leaderboard with baseline results and submitted results.
"""
- global df_synthesized_full, df_synthesized_10, df_human_generated
+ # global df_synthesized_full, df_synthesized_10, df_human_generated
+ global df_synthesized_full

try:
# First, initialize with baseline results
df_synthesized_full = pd.DataFrame(data_synthesized_full)
- df_synthesized_10 = pd.DataFrame(data_synthesized_10)
- df_human_generated = pd.DataFrame(data_human_generated)
+ # df_synthesized_10 = pd.DataFrame(data_synthesized_10)
+ # df_human_generated = pd.DataFrame(data_human_generated)

print("Initialized with baseline results")

@@ -766,7 +787,6 @@ def process_submission(method_name, team_name, dataset, split, contact_email, co
except Exception as e:
print(f"Warning: Failed to delete temporary file {temp_file}: {str(e)}")

-
def filter_by_model_type(df, selected_types):
"""
Filter DataFrame by selected model types, including submitted models.
@@ -795,19 +815,31 @@ def update_tables(selected_types):
Include all models from selected categories.
"""
if not selected_types:
- return [df.head(0) for df in [df_synthesized_full, df_synthesized_10, df_human_generated]]
+ # return [df.head(0) for df in [df_synthesized_full, df_synthesized_10, df_human_generated]]
+ return [df.head(0) for df in [df_synthesized_full]]

filtered_df_full = filter_by_model_type(df_synthesized_full, selected_types)
- filtered_df_10 = filter_by_model_type(df_synthesized_10, selected_types)
- filtered_df_human = filter_by_model_type(df_human_generated, selected_types)
+ # filtered_df_10 = filter_by_model_type(df_synthesized_10, selected_types)
+ # filtered_df_human = filter_by_model_type(df_human_generated, selected_types)

outputs = []
- for df in [filtered_df_full, filtered_df_10, filtered_df_human]:
- for dataset in ['AMAZON', 'MAG', 'PRIME']:
- outputs.append(format_dataframe(df, f"STARK-{dataset}"))
+ # for df in [filtered_df_full, filtered_df_10, filtered_df_human]:
+ for df in [filtered_df_full]:
+ for dataset in ['DL2023', 'MAG', 'PRIME']:
+ outputs.append(format_dataframe(df, f"LLMJudge-{dataset}"))

return outputs

+ def load_test_data():
+ # Split the text into a list
+ test_data = os.getenv('LLMJudgeTest').split()
+ # Reshape the list into a 2D array where each row contains 4 elements
+ test_data = [test_data[i:i+4] for i in range(0, len(test_data), 4)]
+ # Create a DataFrame
+ test_data = pd.DataFrame(test_data, columns=['qid', 'Q0', 'pid', 'score'])
+
+ return test_data
+
css = """
table > thead {
white-space: normal
@@ -829,8 +861,8 @@ table > tbody > tr > td:nth-child(2) > div {

# Main application
with gr.Blocks(css=css) as demo:
- gr.Markdown("# Semi-structured Retrieval Benchmark (STaRK) Leaderboard")
- gr.Markdown("Refer to the [STaRK paper](https://arxiv.org/pdf/2404.13207) for details on metrics, tasks and models.")
+ gr.Markdown("# LLM-as-a-Rel: Automatic Relevance Judgment Leaderboard")
+ gr.Markdown("Refer to the [LLMJudge overview paper](https://arxiv.org/pdf/2408.08896) for details on metrics, tasks and models.")

# Initialize leaderboard at startup
print("Starting leaderboard initialization...")
@@ -870,20 +902,14 @@ with gr.Blocks(css=css) as demo:

# Submission section

- # Split the text into a list
- test_data = os.getenv('LLMJudgeTest').split()
- # Reshape the list into a 2D array where each row contains 4 elements
- test_data = [test_data[i:i+4] for i in range(0, len(test_data), 4)]
- # Create a DataFrame
- df = pd.DataFrame(test_data, columns=['qid', 'Q0', 'pid', 'score'])
- # Display the DataFrame
- print(df)
+ # load test data
+ test_data = load_test_data()

gr.Markdown("---")
gr.Markdown("## Submit Your Results:")
gr.Markdown("""
Submit your results to be included in the leaderboard. Please ensure your submission meets all requirements.
- For questions, contact stark-qa@cs.stanford.edu. Detailed instructions can be referred at [submission instructions](https://docs.google.com/document/d/11coGjTmOEi9p9-PUq1oy0eTOj8f_8CVQhDl5_0FKT14/edit?usp=sharing).
+ For questions, contact hossein.rahmani.22@ucl.ac.uk. Detailed instructions can be referred at [submission instructions](https://docs.google.com/document/d/11coGjTmOEi9p9-PUq1oy0eTOj8f_8CVQhDl5_0FKT14/edit?usp=sharing).
""")

with gr.Row():
@@ -933,8 +959,8 @@ with gr.Blocks(css=css) as demo:
placeholder="e.g., 4x NVIDIA A100 80GB"
)
csv_file = gr.File(
- label="Prediction CSV*",
- file_types=[".csv"],
+ label="Prediction TXT*",
+ file_types=[".txt"],
type="filepath"
)
paper_link = gr.Textbox(
@@ -945,7 +971,6 @@ with gr.Blocks(css=css) as demo:
submit_btn = gr.Button("Submit", variant="primary")
result = gr.Textbox(label="Submission Status", interactive=False)

-
# Set up event handlers
model_type_filter.change(
update_tables,
@@ -973,7 +998,6 @@ with gr.Blocks(css=css) as demo:
inputs=[model_type_filter],
outputs=all_dfs
)
-

# Launch the application
demo.launch()
 
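Note on the new scoring: the upgraded compute_metrics no longer ranks candidates; it measures agreement between a submitted run's 0-3 relevance labels and the held-out LLMJudge test labels using Cohen's kappa and Krippendorff's alpha. A minimal sketch of those two calls on hypothetical data (the truth/preds values below are made up for illustration; cohen_kappa_score and krippendorff.alpha are the same functions imported in the diff):

from sklearn.metrics import cohen_kappa_score
import krippendorff

# Hypothetical example: reference labels vs. a submitted run's labels, both on the 0-3 scale.
truth = [3, 2, 0, 1, 3, 0]
preds = [3, 1, 0, 1, 2, 0]

# Cohen's kappa: chance-corrected agreement between the two sets of labels.
kappa = cohen_kappa_score(truth, preds)

# Krippendorff's alpha over the same pair, treating the 0-3 scale as ordinal,
# mirroring the call in compute_metrics.
alpha = krippendorff.alpha(
    reliability_data=[truth, preds],
    value_domain=[0, 1, 2, 3],
    level_of_measurement='ordinal',
)

print(round(kappa, 4), round(alpha, 4))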