rahmanidashti committed
Commit 43dd1e4 · 1 Parent(s): 0d282c0

upgrade leaderboard

Files changed (1)
  1. app.py +128 -104
app.py CHANGED
@@ -14,6 +14,8 @@ from email.mime.text import MIMEText
from huggingface_hub import HfApi
import shutil
import tempfile
+ from sklearn.metrics import cohen_kappa_score
+ import krippendorff

from stark_qa import load_qa
from stark_qa.evaluator import Evaluator
@@ -23,7 +25,7 @@ from utils.token_handler import TokenHandler

# Initialize storage once at startup
try:
- REPO_ID = "snap-stanford/stark-leaderboard" # Replace with your space name
+ REPO_ID = "rahmanidashti/llm-as-a-rel" # Replace with your space name
hub_storage = HubStorage(REPO_ID)
except Exception as e:
raise RuntimeError(f"Failed to initialize HuggingFace Hub storage: {e}")
@@ -56,50 +58,63 @@ def process_single_instance(args):
result["idx"], result["query_id"] = idx, query_id
return result

-
def compute_metrics(csv_path: str, dataset: str, split: str, num_workers: int = 4):
+ """
+ computing the metrics for the evaluation.
+
+ Parameters:
+ csv_path (str): The path to the submission file for evaluation.
+ """
candidate_ids_dict = {
'amazon': [i for i in range(957192)],
'mag': [i for i in range(1172724, 1872968)],
'prime': [i for i in range(129375)]
}
try:
- eval_csv = pd.read_csv(csv_path)
- if 'query_id' not in eval_csv.columns:
- raise ValueError('No `query_id` column found in the submitted csv.')
- if 'pred_rank' not in eval_csv.columns:
- raise ValueError('No `pred_rank` column found in the submitted csv.')
-
- eval_csv = eval_csv[['query_id', 'pred_rank']]
-
- if dataset not in candidate_ids_dict:
- raise ValueError(f"Invalid dataset '{dataset}', expected one of {list(candidate_ids_dict.keys())}.")
- if split not in ['test', 'test-0.1', 'human_generated_eval']:
- raise ValueError(f"Invalid split '{split}', expected one of ['test', 'test-0.1', 'human_generated_eval'].")
-
- evaluator = Evaluator(candidate_ids_dict[dataset])
- eval_metrics = ['hit@1', 'hit@5', 'recall@20', 'mrr']
- qa_dataset = load_qa(dataset, human_generated_eval=split == 'human_generated_eval')
- split_idx = qa_dataset.get_idx_split()
- all_indices = split_idx[split].tolist()
-
- results_list = []
- query_ids = []
+ # eval_csv = pd.read_csv(csv_path)
+ eval_csv = pd.read_csv(csv_path, sep=" ", header=None, names=['qid', 'Q0', 'docid', 'score'])
+ eval_csv['score'] = [0 if x < 0 else 3 if x > 3 else x for x in eval_csv['score']]
+ test_eval_df = pd.merge(test_data, eval_csv, on=['qid', 'docid'], how='outer')
+ cohen_kappa = cohen_kappa_score(test_eval_df['score_x'], test_eval_df['score_y'])
+ krippendorff_alpha = krippendorff.alpha(reliability_data=[test_eval_df['score_x'], test_eval_df['score_y']], value_domain=[0,1,2,3], level_of_measurement='ordinal')
+
+ # if 'query_id' not in eval_csv.columns:
+ # raise ValueError('No `query_id` column found in the submitted csv.')
+ # if 'pred_rank' not in eval_csv.columns:
+ # raise ValueError('No `pred_rank` column found in the submitted csv.')
+
+ # eval_csv = eval_csv[['query_id', 'pred_rank']]
+
+ # if dataset not in candidate_ids_dict:
+ # raise ValueError(f"Invalid dataset '{dataset}', expected one of {list(candidate_ids_dict.keys())}.")
+ # if split not in ['test', 'test-0.1', 'human_generated_eval']:
+ # raise ValueError(f"Invalid split '{split}', expected one of ['test', 'test-0.1', 'human_generated_eval'].")
+
+ # evaluator = Evaluator(candidate_ids_dict[dataset])
+ # eval_metrics = ['hit@1', 'hit@5', 'recall@20', 'mrr']
+ # qa_dataset = load_qa(dataset, human_generated_eval=split == 'human_generated_eval')
+ # split_idx = qa_dataset.get_idx_split()
+ # all_indices = split_idx[split].tolist()
+
+ # results_list = []
+ # query_ids = []

# Prepare args for each worker
- args = [(idx, eval_csv, qa_dataset, evaluator, eval_metrics) for idx in all_indices]
+ # args = [(idx, eval_csv, qa_dataset, evaluator, eval_metrics) for idx in all_indices]

- with ProcessPoolExecutor(max_workers=num_workers) as executor:
- futures = [executor.submit(process_single_instance, arg) for arg in args]
- for future in tqdm(as_completed(futures), total=len(futures)):
- result = future.result() # This will raise an error if the worker encountered one
- results_list.append(result)
- query_ids.append(result['query_id'])
+ # with ProcessPoolExecutor(max_workers=num_workers) as executor:
+ # futures = [executor.submit(process_single_instance, arg) for arg in args]
+ # for future in tqdm(as_completed(futures), total=len(futures)):
+ # result = future.result() # This will raise an error if the worker encountered one
+ # results_list.append(result)
+ # query_ids.append(result['query_id'])

# Concatenate results and compute final metrics
- eval_csv = pd.concat([eval_csv, pd.DataFrame(results_list)], ignore_index=True)
+ # eval_csv = pd.concat([eval_csv, pd.DataFrame(results_list)], ignore_index=True)
final_results = {
- metric: np.mean(eval_csv[eval_csv['query_id'].isin(query_ids)][metric]) for metric in eval_metrics
+ # metric: np.mean(eval_csv[eval_csv['query_id'].isin(query_ids)][metric]) for metric in eval_metrics
+ 'kappa': round(cohen_kappa, 4),
+ 'alpha': round(krippendorff_alpha, 4)
}
return final_results

@@ -110,60 +125,65 @@ def compute_metrics(csv_path: str, dataset: str, split: str, num_workers: int =
except Exception as error:
return f"{error}"

-
# Data dictionaries for leaderboard
data_synthesized_full = {
- 'Method': ['BM25', 'DPR (roberta)', 'ANCE (roberta)', 'QAGNN (roberta)', 'ada-002', 'voyage-l2-instruct', 'LLM2Vec', 'GritLM-7b', 'multi-ada-002', 'ColBERTv2'],
- 'STARK-AMAZON_Hit@1': [44.94, 15.29, 30.96, 26.56, 39.16, 40.93, 21.74, 42.08, 40.07, 46.10],
- 'STARK-AMAZON_Hit@5': [67.42, 47.93, 51.06, 50.01, 62.73, 64.37, 41.65, 66.87, 64.98, 66.02],
- 'STARK-AMAZON_R@20': [53.77, 44.49, 41.95, 52.05, 53.29, 54.28, 33.22, 56.52, 55.12, 53.44],
- 'STARK-AMAZON_MRR': [55.30, 30.20, 40.66, 37.75, 50.35, 51.60, 31.47, 53.46, 51.55, 55.51],
- 'STARK-MAG_Hit@1': [25.85, 10.51, 21.96, 12.88, 29.08, 30.06, 18.01, 37.90, 25.92, 31.18],
- 'STARK-MAG_Hit@5': [45.25, 35.23, 36.50, 39.01, 49.61, 50.58, 34.85, 56.74, 50.43, 46.42],
- 'STARK-MAG_R@20': [45.69, 42.11, 35.32, 46.97, 48.36, 50.49, 35.46, 46.40, 50.80, 43.94],
- 'STARK-MAG_MRR': [34.91, 21.34, 29.14, 29.12, 38.62, 39.66, 26.10, 47.25, 36.94, 38.39],
- 'STARK-PRIME_Hit@1': [12.75, 4.46, 6.53, 8.85, 12.63, 10.85, 10.10, 15.57, 15.10, 11.75],
- 'STARK-PRIME_Hit@5': [27.92, 21.85, 15.67, 21.35, 31.49, 30.23, 22.49, 33.42, 33.56, 23.85],
- 'STARK-PRIME_R@20': [31.25, 30.13, 16.52, 29.63, 36.00, 37.83, 26.34, 39.09, 38.05, 25.04],
- 'STARK-PRIME_MRR': [19.84, 12.38, 11.05, 14.73, 21.41, 19.99, 16.12, 24.11, 23.49, 17.39]
+ 'Method': ['BM25', 'DPR (roberta)', 'ANCE (roberta)'],
+ 'LLMJudge-DL2023_Kappa': [44.94, 15.29, 30.96],
+ 'LLMJudge-DL2023_Alpha': [67.42, 47.93, 51.06],
}

- data_synthesized_10 = {
- 'Method': ['BM25', 'DPR (roberta)', 'ANCE (roberta)', 'QAGNN (roberta)', 'ada-002', 'voyage-l2-instruct', 'LLM2Vec', 'GritLM-7b', 'multi-ada-002', 'ColBERTv2', 'Claude3 Reranker', 'GPT4 Reranker'],
- 'STARK-AMAZON_Hit@1': [42.68, 16.46, 30.09, 25.00, 39.02, 43.29, 18.90, 43.29, 40.85, 44.31, 45.49, 44.79],
- 'STARK-AMAZON_Hit@5': [67.07, 50.00, 49.27, 48.17, 64.02, 67.68, 37.80, 71.34, 62.80, 65.24, 71.13, 71.17],
- 'STARK-AMAZON_R@20': [54.48, 42.15, 41.91, 51.65, 49.30, 56.04, 34.73, 56.14, 52.47, 51.00, 53.77, 55.35],
- 'STARK-AMAZON_MRR': [54.02, 30.20, 39.30, 36.87, 50.32, 54.20, 28.76, 55.07, 51.54, 55.07, 55.91, 55.69],
- 'STARK-MAG_Hit@1': [27.81, 11.65, 22.89, 12.03, 28.20, 34.59, 19.17, 38.35, 25.56, 31.58, 36.54, 40.90],
- 'STARK-MAG_Hit@5': [45.48, 36.84, 37.26, 37.97, 52.63, 50.75, 33.46, 58.64, 50.37, 47.36, 53.17, 58.18],
- 'STARK-MAG_R@20': [44.59, 42.30, 44.16, 47.98, 49.25, 50.75, 29.85, 46.38, 53.03, 45.72, 48.36, 48.60],
- 'STARK-MAG_MRR': [35.97, 21.82, 30.00, 28.70, 38.55, 42.90, 26.06, 48.25, 36.82, 38.98, 44.15, 49.00],
- 'STARK-PRIME_Hit@1': [13.93, 5.00, 6.78, 7.14, 15.36, 12.14, 9.29, 16.79, 15.36, 15.00, 17.79, 18.28],
- 'STARK-PRIME_Hit@5': [31.07, 23.57, 16.15, 17.14, 31.07, 31.42, 20.7, 34.29, 32.86, 26.07, 36.90, 37.28],
- 'STARK-PRIME_R@20': [32.84, 30.50, 17.07, 32.95, 37.88, 37.34, 25.54, 41.11, 40.99, 27.78, 35.57, 34.05],
- 'STARK-PRIME_MRR': [21.68, 13.50, 11.42, 16.27, 23.50, 21.23, 15.00, 24.99, 23.70, 19.98, 26.27, 26.55]
- }
-
- data_human_generated = {
- 'Method': ['BM25', 'DPR (roberta)', 'ANCE (roberta)', 'QAGNN (roberta)', 'ada-002', 'voyage-l2-instruct', 'LLM2Vec', 'GritLM-7b', 'multi-ada-002', 'ColBERTv2', 'Claude3 Reranker', 'GPT4 Reranker'],
- 'STARK-AMAZON_Hit@1': [27.16, 16.05, 25.93, 22.22, 39.50, 35.80, 29.63, 40.74, 46.91, 33.33, 53.09, 50.62],
- 'STARK-AMAZON_Hit@5': [51.85, 39.51, 54.32, 49.38, 64.19, 62.96, 46.91, 71.60, 72.84, 55.56, 74.07, 75.31],
- 'STARK-AMAZON_R@20': [29.23, 15.23, 23.69, 21.54, 35.46, 33.01, 21.21, 36.30, 40.22, 29.03, 35.46, 35.46],
- 'STARK-AMAZON_MRR': [18.79, 27.21, 37.12, 31.33, 52.65, 47.84, 38.61, 53.21, 58.74, 43.77, 62.11, 61.06],
- 'STARK-MAG_Hit@1': [32.14, 4.72, 25.00, 20.24, 28.57, 22.62, 16.67, 34.52, 23.81, 33.33, 38.10, 36.90],
- 'STARK-MAG_Hit@5': [41.67, 9.52, 30.95, 26.19, 41.67, 36.90, 28.57, 44.04, 41.67, 36.90, 45.24, 46.43],
- 'STARK-MAG_R@20': [32.46, 25.00, 27.24, 28.76, 35.95, 32.44, 21.74, 34.57, 39.85, 30.50, 35.95, 35.95],
- 'STARK-MAG_MRR': [37.42, 7.90, 27.98, 25.53, 35.81, 29.68, 21.59, 38.72, 31.43, 35.97, 42.00, 40.65],
- 'STARK-PRIME_Hit@1': [22.45, 2.04, 7.14, 6.12, 17.35, 16.33, 9.18, 25.51, 24.49, 15.31, 28.57, 28.57],
- 'STARK-PRIME_Hit@5': [41.84, 9.18, 13.27, 13.27, 34.69, 32.65, 21.43, 41.84, 39.80, 26.53, 46.94, 44.90],
- 'STARK-PRIME_R@20': [42.32, 10.69, 11.72, 17.62, 41.09, 39.01, 26.77, 48.10, 47.21, 25.56, 41.61, 41.61],
- 'STARK-PRIME_MRR': [30.37, 7.05, 10.07, 9.39, 26.35, 24.33, 15.24, 34.28, 32.98, 19.67, 36.32, 34.82]
- }
+ # data_synthesized_full = {
+ # 'Method': ['BM25', 'DPR (roberta)', 'ANCE (roberta)', 'QAGNN (roberta)', 'ada-002', 'voyage-l2-instruct', 'LLM2Vec', 'GritLM-7b', 'multi-ada-002', 'ColBERTv2'],
+ # 'STARK-AMAZON_Hit@1': [44.94, 15.29, 30.96, 26.56, 39.16, 40.93, 21.74, 42.08, 40.07, 46.10],
+ # 'STARK-AMAZON_Hit@5': [67.42, 47.93, 51.06, 50.01, 62.73, 64.37, 41.65, 66.87, 64.98, 66.02],
+ # 'STARK-AMAZON_R@20': [53.77, 44.49, 41.95, 52.05, 53.29, 54.28, 33.22, 56.52, 55.12, 53.44],
+ # 'STARK-AMAZON_MRR': [55.30, 30.20, 40.66, 37.75, 50.35, 51.60, 31.47, 53.46, 51.55, 55.51],
+ # 'STARK-MAG_Hit@1': [25.85, 10.51, 21.96, 12.88, 29.08, 30.06, 18.01, 37.90, 25.92, 31.18],
+ # 'STARK-MAG_Hit@5': [45.25, 35.23, 36.50, 39.01, 49.61, 50.58, 34.85, 56.74, 50.43, 46.42],
+ # 'STARK-MAG_R@20': [45.69, 42.11, 35.32, 46.97, 48.36, 50.49, 35.46, 46.40, 50.80, 43.94],
+ # 'STARK-MAG_MRR': [34.91, 21.34, 29.14, 29.12, 38.62, 39.66, 26.10, 47.25, 36.94, 38.39],
+ # 'STARK-PRIME_Hit@1': [12.75, 4.46, 6.53, 8.85, 12.63, 10.85, 10.10, 15.57, 15.10, 11.75],
+ # 'STARK-PRIME_Hit@5': [27.92, 21.85, 15.67, 21.35, 31.49, 30.23, 22.49, 33.42, 33.56, 23.85],
+ # 'STARK-PRIME_R@20': [31.25, 30.13, 16.52, 29.63, 36.00, 37.83, 26.34, 39.09, 38.05, 25.04],
+ # 'STARK-PRIME_MRR': [19.84, 12.38, 11.05, 14.73, 21.41, 19.99, 16.12, 24.11, 23.49, 17.39]
+ # }
+
+ # data_synthesized_10 = {
+ # 'Method': ['BM25', 'DPR (roberta)', 'ANCE (roberta)', 'QAGNN (roberta)', 'ada-002', 'voyage-l2-instruct', 'LLM2Vec', 'GritLM-7b', 'multi-ada-002', 'ColBERTv2', 'Claude3 Reranker', 'GPT4 Reranker'],
+ # 'STARK-AMAZON_Hit@1': [42.68, 16.46, 30.09, 25.00, 39.02, 43.29, 18.90, 43.29, 40.85, 44.31, 45.49, 44.79],
+ # 'STARK-AMAZON_Hit@5': [67.07, 50.00, 49.27, 48.17, 64.02, 67.68, 37.80, 71.34, 62.80, 65.24, 71.13, 71.17],
+ # 'STARK-AMAZON_R@20': [54.48, 42.15, 41.91, 51.65, 49.30, 56.04, 34.73, 56.14, 52.47, 51.00, 53.77, 55.35],
+ # 'STARK-AMAZON_MRR': [54.02, 30.20, 39.30, 36.87, 50.32, 54.20, 28.76, 55.07, 51.54, 55.07, 55.91, 55.69],
+ # 'STARK-MAG_Hit@1': [27.81, 11.65, 22.89, 12.03, 28.20, 34.59, 19.17, 38.35, 25.56, 31.58, 36.54, 40.90],
+ # 'STARK-MAG_Hit@5': [45.48, 36.84, 37.26, 37.97, 52.63, 50.75, 33.46, 58.64, 50.37, 47.36, 53.17, 58.18],
+ # 'STARK-MAG_R@20': [44.59, 42.30, 44.16, 47.98, 49.25, 50.75, 29.85, 46.38, 53.03, 45.72, 48.36, 48.60],
+ # 'STARK-MAG_MRR': [35.97, 21.82, 30.00, 28.70, 38.55, 42.90, 26.06, 48.25, 36.82, 38.98, 44.15, 49.00],
+ # 'STARK-PRIME_Hit@1': [13.93, 5.00, 6.78, 7.14, 15.36, 12.14, 9.29, 16.79, 15.36, 15.00, 17.79, 18.28],
+ # 'STARK-PRIME_Hit@5': [31.07, 23.57, 16.15, 17.14, 31.07, 31.42, 20.7, 34.29, 32.86, 26.07, 36.90, 37.28],
+ # 'STARK-PRIME_R@20': [32.84, 30.50, 17.07, 32.95, 37.88, 37.34, 25.54, 41.11, 40.99, 27.78, 35.57, 34.05],
+ # 'STARK-PRIME_MRR': [21.68, 13.50, 11.42, 16.27, 23.50, 21.23, 15.00, 24.99, 23.70, 19.98, 26.27, 26.55]
+ # }
+
+ # data_human_generated = {
+ # 'Method': ['BM25', 'DPR (roberta)', 'ANCE (roberta)', 'QAGNN (roberta)', 'ada-002', 'voyage-l2-instruct', 'LLM2Vec', 'GritLM-7b', 'multi-ada-002', 'ColBERTv2', 'Claude3 Reranker', 'GPT4 Reranker'],
+ # 'STARK-AMAZON_Hit@1': [27.16, 16.05, 25.93, 22.22, 39.50, 35.80, 29.63, 40.74, 46.91, 33.33, 53.09, 50.62],
+ # 'STARK-AMAZON_Hit@5': [51.85, 39.51, 54.32, 49.38, 64.19, 62.96, 46.91, 71.60, 72.84, 55.56, 74.07, 75.31],
+ # 'STARK-AMAZON_R@20': [29.23, 15.23, 23.69, 21.54, 35.46, 33.01, 21.21, 36.30, 40.22, 29.03, 35.46, 35.46],
+ # 'STARK-AMAZON_MRR': [18.79, 27.21, 37.12, 31.33, 52.65, 47.84, 38.61, 53.21, 58.74, 43.77, 62.11, 61.06],
+ # 'STARK-MAG_Hit@1': [32.14, 4.72, 25.00, 20.24, 28.57, 22.62, 16.67, 34.52, 23.81, 33.33, 38.10, 36.90],
+ # 'STARK-MAG_Hit@5': [41.67, 9.52, 30.95, 26.19, 41.67, 36.90, 28.57, 44.04, 41.67, 36.90, 45.24, 46.43],
+ # 'STARK-MAG_R@20': [32.46, 25.00, 27.24, 28.76, 35.95, 32.44, 21.74, 34.57, 39.85, 30.50, 35.95, 35.95],
+ # 'STARK-MAG_MRR': [37.42, 7.90, 27.98, 25.53, 35.81, 29.68, 21.59, 38.72, 31.43, 35.97, 42.00, 40.65],
+ # 'STARK-PRIME_Hit@1': [22.45, 2.04, 7.14, 6.12, 17.35, 16.33, 9.18, 25.51, 24.49, 15.31, 28.57, 28.57],
+ # 'STARK-PRIME_Hit@5': [41.84, 9.18, 13.27, 13.27, 34.69, 32.65, 21.43, 41.84, 39.80, 26.53, 46.94, 44.90],
+ # 'STARK-PRIME_R@20': [42.32, 10.69, 11.72, 17.62, 41.09, 39.01, 26.77, 48.10, 47.21, 25.56, 41.61, 41.61],
+ # 'STARK-PRIME_MRR': [30.37, 7.05, 10.07, 9.39, 26.35, 24.33, 15.24, 34.28, 32.98, 19.67, 36.32, 34.82]
+ # }

# Initialize DataFrames
df_synthesized_full = pd.DataFrame(data_synthesized_full)
- df_synthesized_10 = pd.DataFrame(data_synthesized_10)
- df_human_generated = pd.DataFrame(data_human_generated)
+ # df_synthesized_10 = pd.DataFrame(data_synthesized_10)
+ # df_human_generated = pd.DataFrame(data_human_generated)

# Model type definitions
model_types = {
@@ -347,13 +367,14 @@ def initialize_leaderboard():
"""
Initialize the leaderboard with baseline results and submitted results.
"""
- global df_synthesized_full, df_synthesized_10, df_human_generated
+ # global df_synthesized_full, df_synthesized_10, df_human_generated
+ global df_synthesized_full

try:
# First, initialize with baseline results
df_synthesized_full = pd.DataFrame(data_synthesized_full)
- df_synthesized_10 = pd.DataFrame(data_synthesized_10)
- df_human_generated = pd.DataFrame(data_human_generated)
+ # df_synthesized_10 = pd.DataFrame(data_synthesized_10)
+ # df_human_generated = pd.DataFrame(data_human_generated)

print("Initialized with baseline results")

@@ -766,7 +787,6 @@ def process_submission(method_name, team_name, dataset, split, contact_email, co
except Exception as e:
print(f"Warning: Failed to delete temporary file {temp_file}: {str(e)}")

-
def filter_by_model_type(df, selected_types):
"""
Filter DataFrame by selected model types, including submitted models.
@@ -795,19 +815,31 @@ def update_tables(selected_types):
Include all models from selected categories.
"""
if not selected_types:
- return [df.head(0) for df in [df_synthesized_full, df_synthesized_10, df_human_generated]]
+ # return [df.head(0) for df in [df_synthesized_full, df_synthesized_10, df_human_generated]]
+ return [df.head(0) for df in [df_synthesized_full]]

filtered_df_full = filter_by_model_type(df_synthesized_full, selected_types)
- filtered_df_10 = filter_by_model_type(df_synthesized_10, selected_types)
- filtered_df_human = filter_by_model_type(df_human_generated, selected_types)
+ # filtered_df_10 = filter_by_model_type(df_synthesized_10, selected_types)
+ # filtered_df_human = filter_by_model_type(df_human_generated, selected_types)

outputs = []
- for df in [filtered_df_full, filtered_df_10, filtered_df_human]:
- for dataset in ['AMAZON', 'MAG', 'PRIME']:
- outputs.append(format_dataframe(df, f"STARK-{dataset}"))
+ # for df in [filtered_df_full, filtered_df_10, filtered_df_human]:
+ for df in [filtered_df_full]:
+ for dataset in ['DL2023', 'MAG', 'PRIME']:
+ outputs.append(format_dataframe(df, f"LLMJudge-{dataset}"))

return outputs

+ def load_test_data():
+ # Split the text into a list
+ test_data = os.getenv('LLMJudgeTest').split()
+ # Reshape the list into a 2D array where each row contains 4 elements
+ test_data = [test_data[i:i+4] for i in range(0, len(test_data), 4)]
+ # Create a DataFrame
+ test_data = pd.DataFrame(test_data, columns=['qid', 'Q0', 'pid', 'score'])
+
+ return test_data
+
css = """
table > thead {
white-space: normal
@@ -829,8 +861,8 @@ table > tbody > tr > td:nth-child(2) > div {

# Main application
with gr.Blocks(css=css) as demo:
- gr.Markdown("# Semi-structured Retrieval Benchmark (STaRK) Leaderboard")
- gr.Markdown("Refer to the [STaRK paper](https://arxiv.org/pdf/2404.13207) for details on metrics, tasks and models.")
+ gr.Markdown("# LLM-as-a-Rel: Automatic Relevance Judgment Leaderboard")
+ gr.Markdown("Refer to the [LLMJudge overview paper](https://arxiv.org/pdf/2408.08896) for details on metrics, tasks and models.")

# Initialize leaderboard at startup
print("Starting leaderboard initialization...")
@@ -870,20 +902,14 @@ with gr.Blocks(css=css) as demo:

# Submission section

- # Split the text into a list
- test_data = os.getenv('LLMJudgeTest').split()
- # Reshape the list into a 2D array where each row contains 4 elements
- test_data = [test_data[i:i+4] for i in range(0, len(test_data), 4)]
- # Create a DataFrame
- df = pd.DataFrame(test_data, columns=['qid', 'Q0', 'pid', 'score'])
- # Display the DataFrame
- print(df)
+ # load test data
+ test_data = load_test_data()

gr.Markdown("---")
gr.Markdown("## Submit Your Results:")
gr.Markdown("""
Submit your results to be included in the leaderboard. Please ensure your submission meets all requirements.
- For questions, contact stark-qa@cs.stanford.edu. Detailed instructions can be referred at [submission instructions](https://docs.google.com/document/d/11coGjTmOEi9p9-PUq1oy0eTOj8f_8CVQhDl5_0FKT14/edit?usp=sharing).
+ For questions, contact hossein.rahmani.22@ucl.ac.uk. Detailed instructions can be referred at [submission instructions](https://docs.google.com/document/d/11coGjTmOEi9p9-PUq1oy0eTOj8f_8CVQhDl5_0FKT14/edit?usp=sharing).
""")

with gr.Row():
@@ -933,8 +959,8 @@ with gr.Blocks(css=css) as demo:
placeholder="e.g., 4x NVIDIA A100 80GB"
)
csv_file = gr.File(
- label="Prediction CSV*",
- file_types=[".csv"],
+ label="Prediction TXT*",
+ file_types=[".txt"],
type="filepath"
)
paper_link = gr.Textbox(
@@ -945,7 +971,6 @@ with gr.Blocks(css=css) as demo:
submit_btn = gr.Button("Submit", variant="primary")
result = gr.Textbox(label="Submission Status", interactive=False)

-
# Set up event handlers
model_type_filter.change(
update_tables,
@@ -973,7 +998,6 @@ with gr.Blocks(css=css) as demo:
inputs=[model_type_filter],
outputs=all_dfs
)
-

# Launch the application
demo.launch()
 
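Note on the new scoring: the upgraded compute_metrics no longer ranks candidates; it measures agreement between a submitted run's 0-3 relevance labels and the held-out LLMJudge test labels using Cohen's kappa and Krippendorff's alpha. A minimal sketch of those two calls on hypothetical data (the truth/preds values below are made up for illustration; cohen_kappa_score and krippendorff.alpha are the same functions imported in the diff):

from sklearn.metrics import cohen_kappa_score
import krippendorff

# Hypothetical example: reference labels vs. a submitted run's labels, both on the 0-3 scale.
truth = [3, 2, 0, 1, 3, 0]
preds = [3, 1, 0, 1, 2, 0]

# Cohen's kappa: chance-corrected agreement between the two sets of labels.
kappa = cohen_kappa_score(truth, preds)

# Krippendorff's alpha over the same pair, treating the 0-3 scale as ordinal,
# mirroring the call in compute_metrics.
alpha = krippendorff.alpha(
    reliability_data=[truth, preds],
    value_domain=[0, 1, 2, 3],
    level_of_measurement='ordinal',
)

print(round(kappa, 4), round(alpha, 4))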