IL-TUR-Leaderboard

Running

App Files Files Community

abhinav-joshi committed on Jun 29, 2024

Commit

2b8f89d

1 Parent(s): eb68762

add baseline results

Browse files

Files changed (11) hide show

.DS_Store +0 -0
app.py +313 -28
dummy.py +15 -0
submissions/.DS_Store +0 -0
submissions/baseline/baseline -pre2.csv +12 -0
submissions/baseline/baseline-pre.csv +7 -0
submissions/baseline/baseline.csv +11 -7
submissions/baseline/results-bacup.json +133 -0
submissions/baseline/results.json +133 -0
submissions/baseline/submission.json +16 -0
uploads.py +53 -34

.DS_Store CHANGED Viewed

Binary files a/.DS_Store and b/.DS_Store differ

app.py CHANGED Viewed

@@ -7,15 +7,14 @@ from uploads import add_new_eval
 CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
 CITATION_BUTTON_TEXT = r"""@inproceedings{iltur-2024,
-    title = "IL-TUR: Benchmark for Indian Legal Text Understanding and Reasoning",
-    author = "Joshi, Abhinav  and Paul, Shaunak Sharma, Akshat  and Goyal, Pawan  and   Ghosh, Saptarshi and Modi, Ashutosh",
-    booktitle = "Proceedings of the 62st Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)",
-    month = aug,
-    year = "2024",
-    address = "Bangkok, Thailand",
-    publisher = "Association for Computational Linguistics",
-}
-}"""
 api = HfApi()
 TOKEN = os.environ.get("TOKEN", None)
@@ -27,7 +26,7 @@ def restart_space():
 # Function to load data from a given CSV file
-def baseline_load_data(tasks):
     # version = version.replace("%", "p")
     file_path = f"submissions/baseline/baseline.csv"  # Replace with your file paths
     df = pd.read_csv(file_path)
@@ -46,6 +45,20 @@ def baseline_load_data(tasks):
         "SUMM",
         "Average",
     ]
     if tasks is None:
         breakpoint()
     # based on the tasks, remove the columns that are not needed
@@ -65,14 +78,77 @@ def baseline_load_data(tasks):
         column_names.remove("SUMM")
     df = df[column_names]
-    df = df.sort_values(by="Average", ascending=False)
     df = df.drop_duplicates(subset=["Method"], keep="first")
     return df
-def load_data(tasks):
-    baseline_df = baseline_load_data(tasks)
     return baseline_df
@@ -86,8 +162,29 @@ def search_leaderboard(df, query):
 # Function to change the version of the leaderboard
-def change_version(tasks):
-    new_df = load_data(tasks)
     return new_df
@@ -120,6 +217,57 @@ with demo:
                     label="Select Tasks",
                     choices=["L-NER", "RR", "CJPE", "BAIL", "LSI", "PCR", "SUMM"],
                     value=["L-NER", "RR", "CJPE", "BAIL", "LSI", "PCR", "SUMM"],
                 )
             with gr.Row():
@@ -128,10 +276,22 @@ with demo:
                     show_label=False,
                 )
             leaderboard_table = gr.components.Dataframe(
                 value=load_data(
                     # "baseline",
                     ["L-NER", "RR", "CJPE", "BAIL", "LSI", "PCR", "SUMM"],
                 ),
                 interactive=True,
                 visible=True,
@@ -151,31 +311,156 @@ with demo:
             search_bar.change(
                 search_leaderboard,
                 inputs=[
-                    leaderboard_table,
-                    search_bar,
-                    # tasks_checkbox
                 ],
                 outputs=leaderboard_table,
             )
             tasks_checkbox.change(
                 change_version,
-                inputs=[tasks_checkbox],
                 outputs=leaderboard_table,
             )
-    with gr.Accordion("Submit a new model for evaluation"):
         with gr.Row():
             with gr.Column():
-                method_name_textbox = gr.Textbox(label="Method name")
-                url_textbox = gr.Textbox(label="Url to model information")
-            with gr.Column():
                 organisation = gr.Textbox(label="Organisation")
                 mail = gr.Textbox(label="Contact email")
                 file_output = gr.File()
-        submit_button = gr.Button("Submit Eval")
         submission_result = gr.Markdown()
         submit_button.click(
             add_new_eval,
@@ -221,5 +506,5 @@ with demo:
 scheduler = BackgroundScheduler()
 scheduler.add_job(restart_space, "interval", seconds=3600)
 scheduler.start()
-# demo.launch(debug=True)
-demo.launch(share=True)

 CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
 CITATION_BUTTON_TEXT = r"""@inproceedings{iltur-2024,
+      title = "IL-TUR: Benchmark for Indian Legal Text Understanding and Reasoning",
+      author = "Joshi, Abhinav and Paul, Shounak and Sharma, Akshat and Goyal, Pawan and Ghosh, Saptarshi and Modi, Ashutosh",
+      booktitle = "Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)",
+      month = aug,
+      year = "2024",
+      address = "Bangkok, Thailand",
+      publisher = "Association for Computational Linguistics",
+  }"""
 api = HfApi()
 TOKEN = os.environ.get("TOKEN", None)
 # Function to load data from a given CSV file
+def baseline_load_data(tasks, task_metrics):
     # version = version.replace("%", "p")
     file_path = f"submissions/baseline/baseline.csv"  # Replace with your file paths
     df = pd.read_csv(file_path)
         "SUMM",
         "Average",
     ]
+    # Method,Submitted by,L-NER,RR,CJPE,BAIL,LSI,PCR,SUMM,L-MT
+    column_names = [
+        "Method",
+        "Submitted By",
+        "L-NER",
+        "RR",
+        "CJPE",
+        "BAIL",
+        "LSI",
+        "PCR",
+        "SUMM",
+        # "Average",
+    ]
     if tasks is None:
         breakpoint()
     # based on the tasks, remove the columns that are not needed
         column_names.remove("SUMM")
     df = df[column_names]
+    import json
+    # load the results json file
+    with open("submissions/baseline/results.json") as f:
+        results = json.load(f)
+    # add the results to the dataframe
+    # Method,Submitted By,L-NER,RR,CJPE,BAIL,LSI,PCR,SUMM,L-MT
+    # Metric,-,strict mF1,mF1,mF1|ROUGE-L|BLEU,mF1,mF1,muF1@K,ROUGE-L|BERTSCORE,BLEU|GLEU|chrF++
+    # create a new df to display the results
+    results_df = pd.DataFrame(
+        columns=[
+            "Method",
+            "Submitted By",
+            "Github Link",
+            "L-NER",
+            "RR",
+            "CJPE",
+            "BAIL",
+            "LSI",
+            "PCR",
+            "SUMM",
+            "L-MT",
+            # "Average",
+        ]
+    )
+    # breakpoint()
+    for entry in results:
+        results_df = results_df.append(
+            {
+                "Method": entry["Method"],
+                "Submitted By": entry["Submitted By"],
+                "Github Link": entry["Github Link"],
+                "L-NER": entry["L-NER"][task_metrics["L-NER"]],
+                "RR": entry["RR"][task_metrics["RR"]],
+                "CJPE": entry["CJPE"][task_metrics["CJPE"]],
+                "BAIL": entry["BAIL"][task_metrics["BAIL"]],
+                "LSI": entry["LSI"][task_metrics["LSI"]],
+                "PCR": entry["PCR"][task_metrics["PCR"]],
+                "SUMM": entry["SUMM"][task_metrics["SUMM"]],
+                "L-MT": entry["L-MT"][task_metrics["L-MT"]],
+                # "Average": ,
+            },
+            ignore_index=True,
+        )
+    # breakpoint()
+    # add the average column
+    # results_df["Average"] = results_df.mean(axis=1)
+    df = results_df
+    # df = df.sort_values(by="Average", ascending=False)
+    # remove the columns that are not in tasks
+    selected_columns = (
+        [
+            "Method",
+            "Submitted By",
+        ]
+        + tasks
+        + ["Github Link"]
+    )
+    print(tasks)
+    df = df[selected_columns]
     df = df.drop_duplicates(subset=["Method"], keep="first")
     return df
+def load_data(tasks, task_metrics):
+    baseline_df = baseline_load_data(tasks, task_metrics)
     return baseline_df
 # Function to change the version of the leaderboard
+def change_version(
+    tasks,
+    l_ner_metric,
+    rr_metric,
+    cjpe_metric,
+    bail_metric,
+    lsi_metric,
+    pcr_metric,
+    summ_metric,
+    lmt_metric,
+):
+    task_metrics = {
+        "L-NER": l_ner_metric,
+        "RR": rr_metric,
+        "CJPE": cjpe_metric,
+        "BAIL": bail_metric,
+        "LSI": lsi_metric,
+        "PCR": pcr_metric,
+        "SUMM": summ_metric,
+        "L-MT": lmt_metric,
+    }
+    new_df = load_data(tasks, task_metrics)
     return new_df
                     label="Select Tasks",
                     choices=["L-NER", "RR", "CJPE", "BAIL", "LSI", "PCR", "SUMM"],
                     value=["L-NER", "RR", "CJPE", "BAIL", "LSI", "PCR", "SUMM"],
+                    interactive=True,
+                )
+            with gr.Row():
+                l_ner_metric = gr.Radio(
+                    label="L-NER",
+                    choices=["strict mF1"],
+                    value="strict mF1",
+                    interactive=True,
+                )
+                rr_metric = gr.Radio(
+                    label="RR",
+                    choices=["mF1"],
+                    value="mF1",
+                    interactive=True,
+                )
+                cjpe_metric = gr.Radio(
+                    label="CJPE",
+                    choices=["mF1", "ROUGE-L", "BLEU"],
+                    value="mF1",
+                    interactive=True,
+                )
+                bail_metric = gr.Radio(
+                    label="BAIL",
+                    choices=["mF1"],
+                    value="mF1",
+                    interactive=True,
+                )
+                lsi_metric = gr.Radio(
+                    label="LSI",
+                    choices=["mF1"],
+                    value="mF1",
+                    interactive=True,
+                )
+                pcr_metric = gr.Radio(
+                    label="PCR",
+                    choices=["muF1@K"],
+                    value="muF1@K",
+                    interactive=True,
+                )
+                summ_metric = gr.Radio(
+                    label="SUMM",
+                    choices=["ROUGE-L", "BERTSCORE"],
+                    value="ROUGE-L",
+                    interactive=True,
+                )
+                lmt_metric = gr.Radio(
+                    label="L-MT",
+                    choices=["BLEU", "GLEU", "chrF++"],
+                    value="BLEU",
+                    interactive=True,
                 )
             with gr.Row():
                     show_label=False,
                 )
+            task_metrics = {
+                "L-NER": l_ner_metric.value,
+                "RR": rr_metric.value,
+                "CJPE": cjpe_metric.value,
+                "BAIL": bail_metric.value,
+                "LSI": lsi_metric.value,
+                "PCR": pcr_metric.value,
+                "SUMM": summ_metric.value,
+                "L-MT": lmt_metric.value,
+            }
             leaderboard_table = gr.components.Dataframe(
                 value=load_data(
                     # "baseline",
                     ["L-NER", "RR", "CJPE", "BAIL", "LSI", "PCR", "SUMM"],
+                    task_metrics=task_metrics,
                 ),
                 interactive=True,
                 visible=True,
             search_bar.change(
                 search_leaderboard,
+                inputs=[leaderboard_table, search_bar],
+                outputs=leaderboard_table,
+            )
+            # breakpoint()
+            l_ner_metric.change(
+                change_version,
                 inputs=[
+                    tasks_checkbox,
+                    l_ner_metric,
+                    rr_metric,
+                    cjpe_metric,
+                    bail_metric,
+                    lsi_metric,
+                    pcr_metric,
+                    summ_metric,
+                    lmt_metric,
+                ],
+                outputs=leaderboard_table,
+            )
+            rr_metric.change(
+                change_version,
+                inputs=[
+                    tasks_checkbox,
+                    l_ner_metric,
+                    rr_metric,
+                    cjpe_metric,
+                    bail_metric,
+                    lsi_metric,
+                    pcr_metric,
+                    summ_metric,
+                    lmt_metric,
+                ],
+                outputs=leaderboard_table,
+            )
+            cjpe_metric.change(
+                change_version,
+                inputs=[
+                    tasks_checkbox,
+                    l_ner_metric,
+                    rr_metric,
+                    cjpe_metric,
+                    bail_metric,
+                    lsi_metric,
+                    pcr_metric,
+                    summ_metric,
+                    lmt_metric,
+                ],
+                outputs=leaderboard_table,
+            )
+            bail_metric.change(
+                change_version,
+                inputs=[
+                    tasks_checkbox,
+                    l_ner_metric,
+                    rr_metric,
+                    cjpe_metric,
+                    bail_metric,
+                    lsi_metric,
+                    pcr_metric,
+                    summ_metric,
+                    lmt_metric,
+                ],
+                outputs=leaderboard_table,
+            )
+            lsi_metric.change(
+                change_version,
+                inputs=[
+                    tasks_checkbox,
+                    l_ner_metric,
+                    rr_metric,
+                    cjpe_metric,
+                    bail_metric,
+                    lsi_metric,
+                    pcr_metric,
+                    summ_metric,
+                    lmt_metric,
+                ],
+                outputs=leaderboard_table,
+            )
+            pcr_metric.change(
+                change_version,
+                inputs=[
+                    tasks_checkbox,
+                    l_ner_metric,
+                    rr_metric,
+                    cjpe_metric,
+                    bail_metric,
+                    lsi_metric,
+                    pcr_metric,
+                    summ_metric,
+                    lmt_metric,
+                ],
+                outputs=leaderboard_table,
+            )
+            summ_metric.change(
+                change_version,
+                inputs=[
+                    tasks_checkbox,
+                    l_ner_metric,
+                    rr_metric,
+                    cjpe_metric,
+                    bail_metric,
+                    lsi_metric,
+                    pcr_metric,
+                    summ_metric,
+                    lmt_metric,
+                ],
+                outputs=leaderboard_table,
+            )
+            lmt_metric.change(
+                change_version,
+                inputs=[
+                    tasks_checkbox,
+                    l_ner_metric,
+                    rr_metric,
+                    cjpe_metric,
+                    bail_metric,
+                    lsi_metric,
+                    pcr_metric,
+                    summ_metric,
+                    lmt_metric,
                 ],
                 outputs=leaderboard_table,
             )
             tasks_checkbox.change(
                 change_version,
+                inputs=[
+                    tasks_checkbox,
+                    l_ner_metric,
+                    rr_metric,
+                    cjpe_metric,
+                    bail_metric,
+                    lsi_metric,
+                    pcr_metric,
+                    summ_metric,
+                    lmt_metric,
+                ],
                 outputs=leaderboard_table,
             )
+    with gr.Accordion("Submit the results of your Method"):
         with gr.Row():
             with gr.Column():
+                method_name_textbox = gr.Textbox(label="Method")
+                url_textbox = gr.Textbox(label="Github Link")
                 organisation = gr.Textbox(label="Organisation")
                 mail = gr.Textbox(label="Contact email")
+            with gr.Column():
                 file_output = gr.File()
+                submit_button = gr.Button("Submit Eval")
         submission_result = gr.Markdown()
         submit_button.click(
             add_new_eval,
 scheduler = BackgroundScheduler()
 scheduler.add_job(restart_space, "interval", seconds=3600)
 scheduler.start()
+demo.launch(debug=True)
+# demo.launch(share=True)

dummy.py ADDED Viewed

	@@ -0,0 +1,15 @@

+import json
+# load the results json file
+with open("submissions/baseline/results.json") as f:
+    results = json.load(f)
+# load the submission json file
+with open("submissions/baseline/submission.json") as f:
+    submission = json.load(f)
+breakpoint()
+# update the results
+results.append(submission[0])

submissions/.DS_Store CHANGED Viewed

Binary files a/submissions/.DS_Store and b/submissions/.DS_Store differ

submissions/baseline/baseline -pre2.csv ADDED Viewed

	@@ -0,0 +1,12 @@

+Method,Submitted By,L-NER,RR,CJPE,BAIL,LSI,PCR,SUMM,L-MT
+Metric,-,strict mF1,mF1,mF1|ROUGE-L|BLEU,mF1,mF1,muF1@K,ROUGE-L|BERTSCORE,BLEU|GLEU|chrF++
+SOTA,various,48.58,69.01,81.31|56.00|32.00,81,28.08,39.15,33.00|86.00,28.00|32.00|57.00
+BERT,various,39.59,58,71.14|-|-,-,18.44,9.24,-|-,-|-|-
+LegalBERT,various,45.58,54,78.21|-|-,-,21.74,8.67,-|-,-|-|-
+InLegalBERT,various,48.58,58,81.31|-|-,-,26.23,7.57,-|-,-|-|-
+GPT-3.5 (0-shot),IL-TUR team,30.59,30.95,54.17|30.00|8.00,51.04,21.55,-,21.00|85.00,23.00|28.00|42.00
+GPT-3.5 (1-shot),IL-TUR team,23.68,30.05,51.46|29.00|15.00,46.35,22.61,-,20.00|84.00,25.00|28.00|43.00
+GPT-3.5 (2-shot),IL-TUR team,32.84,30.31,56.74|30.00|11.00,61,21.4,-,22.00|84.00,26.00|29.00|43.00
+GPT-4 (0-shot),IL-TUR team,13.65,37.37,68.29|40.00|14.00,51.46,23.99,-,23.00|85.00,33.00|36.00|50.00
+GPT-4 (1-shot),IL-TUR team,10.51,37.43,47.26|39.00|16.00,56.9,22.26,-,16.00|81.00,35.00|38.00|52.00
+GPT-4 (2-shot),IL-TUR team,24.03,38.18,60.44|43.00|18.00,66.67,20.53,-,17.00|81.00,36.00|39.00|53.00

submissions/baseline/baseline-pre.csv ADDED Viewed

	@@ -0,0 +1,7 @@

+Unnamed: 0,index,Method,Submitted By,L-NER,RR,CJPE,BAIL,LSI,PCR,SUMM,Average
+,0,baseline,baseline,0,0,0,0,0,0,0,0
+,0,baseline2,baseline2,0,0,0,0,0,0,0,0
+,0,baseline,baseline,0,0,0,0,0,0,0,0
+,0,random,random,0,0,0,0,0,0,0,0
+,0,random2,random22,0,0,0,0,0,0,0,0
+,0,random5,random55,0,0,0,0,0,0,0,0

submissions/baseline/baseline.csv CHANGED Viewed

@@ -1,7 +1,11 @@
-Unnamed: 0,index,Method,Submitted By,L-NER,RR,CJPE,BAIL,LSI,PCR,SUMM,Average
-,0,baseline,baseline,0,0,0,0,0,0,0,0
-,0,baseline2,baseline2,0,0,0,0,0,0,0,0
-,0,baseline,baseline,0,0,0,0,0,0,0,0
-,0,random,random,0,0,0,0,0,0,0,0
-,0,random2,random22,0,0,0,0,0,0,0,0
-,0,random5,random55,0,0,0,0,0,0,0,0

+Method,Submitted By,L-NER strict mF1,RR mF1,CJPE mF1,CJPE ROUGE-L,CJPE BLEU,BAIL mF1,LSI mF1,PCR muF1@K,SUMM ROUGE-L,SUMM BERTSCORE,L-MT BLEU,L-MT GLEU,L-MT chrF++
+SOTA,various,48.58,69.01,81.31,56.00,32.00,81,28.08,39.15,33.00,86.00,28.00,32.00,57.00
+BERT,various,39.59,58,71.14,-,-,-,18.44,9.24,-,-,-,-,-
+LegalBERT,various,45.58,54,78.21,-,-,-,21.74,8.67,-,-,-,-,-
+InLegalBERT,various,48.58,58,81.31,-,-,-,26.23,7.57,-,-,-,-,-
+GPT-3.5 (0-shot),IL-TUR team,30.59,30.95,54.17,30.00,8.00,51.04,21.55,-,21.00,85.00,23.00,28.00,42.00
+GPT-3.5 (1-shot),IL-TUR team,23.68,30.05,51.46,29.00,15.00,46.35,22.61,-,20.00,84.00,25.00,28.00,43.00
+GPT-3.5 (2-shot),IL-TUR team,32.84,30.31,56.74,30.00,11.00,61,21.4,-,22.00,84.00,26.00,29.00,43.00
+GPT-4 (0-shot),IL-TUR team,13.65,37.37,68.29,40.00,14.00,51.46,23.99,-,23.00,85.00,33.00,36.00,50.00
+GPT-4 (1-shot),IL-TUR team,10.51,37.43,47.26,39.00,16.00,56.9,22.26,-,16.00,81.00,35.00,38.00,52.00
+GPT-4 (2-shot),IL-TUR team,24.03,38.18,60.44,43.00,18.00,66.67,20.53,-,17.00,81.00,36.00,39.00,53.00

submissions/baseline/results-bacup.json ADDED Viewed

	@@ -0,0 +1,133 @@

+[
+    {
+      "Method": "SOTA",
+      "Submitted By": "multiple",
+      "Github Link": "exploration-lab.github.io/IL-TUR/",
+      "L-NER": {"strict mF1": "48.58"},
+      "RR": {"mF1": "69.01"},
+      "CJPE": {"mF1": "81.31", "ROUGE-L": "56.00", "BLEU": "32.00"},
+      "BAIL": {"mF1": "81"},
+      "LSI": {"mF1": "28.08"},
+      "PCR": {"muF1@K": "39.15"},
+      "SUMM": {"ROUGE-L": "33.00", "BERTSCORE": "86.00"},
+      "L-MT": {"BLEU": "28.00", "GLEU": "32.00", "chrF++": "57.00"}
+    },
+    {
+      "Method": "BERT",
+      "Submitted By": "multiple",
+      "Github Link": "",
+      "L-NER": {"strict mF1": "39.59"},
+      "RR": {"mF1": "58"},
+      "CJPE": {"mF1": "71.14", "ROUGE-L": "-", "BLEU": "-"},
+      "BAIL": {"mF1": "-"},
+      "LSI": {"mF1": "-"},
+      "PCR": {"muF1@K": "18.44"},
+      "SUMM": {"ROUGE-L": "9.24", "BERTSCORE": "-"},
+      "L-MT": {"BLEU": "-", "GLEU": "-", "chrF++": "-"}
+    },
+    {
+      "Method": "LegalBERT",
+      "Submitted By": "multiple",
+      "Github Link": "",
+      "L-NER": {"strict mF1": "45.58"},
+      "RR": {"mF1": "54"},
+      "CJPE": {"mF1": "78.21", "ROUGE-L": "-", "BLEU": "-"},
+      "BAIL": {"mF1": "-"},
+      "LSI": {"mF1": "-"},
+      "PCR": {"muF1@K": "21.74"},
+      "SUMM": {"ROUGE-L": "8.67", "BERTSCORE": "-"},
+      "L-MT": {"BLEU": "-", "GLEU": "-", "chrF++": "-"}
+    },
+    {
+      "Method": "InLegalBERT",
+      "Submitted By": "multiple",
+      "Github Link": "",
+      "L-NER": {"strict mF1": "48.58"},
+      "RR": {"mF1": "58"},
+      "CJPE": {"mF1": "81.31", "ROUGE-L": "-", "BLEU": "-"},
+      "BAIL": {"mF1": "-"},
+      "LSI": {"mF1": "-"},
+      "PCR": {"muF1@K": "26.23"},
+      "SUMM": {"ROUGE-L": "7.57", "BERTSCORE": "-"},
+      "L-MT": {"BLEU": "-", "GLEU": "-", "chrF++": "-"}
+    },
+    {
+      "Method": "GPT-3.5 (0-shot)",
+      "Submitted By": "IL-TUR",
+      "Github Link": "",
+      "L-NER": {"strict mF1": "30.59"},
+      "RR": {"mF1": "30.95"},
+      "CJPE": {"mF1": "54.17", "ROUGE-L": "30.00", "BLEU": "8.00"},
+      "BAIL": {"mF1": "51.04"},
+      "LSI": {"mF1": "21.55"},
+      "PCR": {"muF1@K": "-"},
+      "SUMM": {"ROUGE-L": "21.00", "BERTSCORE": "85.00"},
+      "L-MT": {"BLEU": "23.00", "GLEU": "28.00", "chrF++": "42.00"}
+    },
+    {
+      "Method": "GPT-3.5 (1-shot)",
+      "Submitted By": "IL-TUR",
+      "Github Link": "",
+      "L-NER": {"strict mF1": "23.68"},
+      "RR": {"mF1": "30.05"},
+      "CJPE": {"mF1": "51.46", "ROUGE-L": "29.00", "BLEU": "15.00"},
+      "BAIL": {"mF1": "46.35"},
+      "LSI": {"mF1": "22.61"},
+      "PCR": {"muF1@K": "-"},
+      "SUMM": {"ROUGE-L": "20.00", "BERTSCORE": "84.00"},
+      "L-MT": {"BLEU": "25.00", "GLEU": "28.00", "chrF++": "43.00"}
+    },
+    {
+      "Method": "GPT-3.5 (2-shot)",
+      "Submitted By": "IL-TUR",
+      "Github Link": "",
+      "L-NER": {"strict mF1": "32.84"},
+      "RR": {"mF1": "30.31"},
+      "CJPE": {"mF1": "56.74", "ROUGE-L": "30.00", "BLEU": "11.00"},
+      "BAIL": {"mF1": "61"},
+      "LSI": {"mF1": "21.4"},
+      "PCR": {"muF1@K": "-"},
+      "SUMM": {"ROUGE-L": "22.00", "BERTSCORE": "84.00"},
+      "L-MT": {"BLEU": "26.00", "GLEU": "29.00", "chrF++": "43.00"}
+    },
+    {
+      "Method": "GPT-4 (0-shot)",
+      "Submitted By": "IL-TUR",
+      "Github Link": "",
+      "L-NER": {"strict mF1": "13.65"},
+      "RR": {"mF1": "37.37"},
+      "CJPE": {"mF1": "68.29", "ROUGE-L": "40.00", "BLEU": "14.00"},
+      "BAIL": {"mF1": "51.46"},
+      "LSI": {"mF1": "23.99"},
+      "PCR": {"muF1@K": "-"},
+      "SUMM": {"ROUGE-L": "23.00", "BERTSCORE": "85.00"},
+      "L-MT": {"BLEU": "33.00", "GLEU": "36.00", "chrF++": "50.00"}
+    },
+    {
+      "Method": "GPT-4 (1-shot)",
+      "Submitted By": "IL-TUR",
+      "Github Link": "",
+      "L-NER": {"strict mF1": "10.51"},
+      "RR": {"mF1": "37.43"},
+      "CJPE": {"mF1": "47.26", "ROUGE-L": "39.00", "BLEU": "16.00"},
+      "BAIL": {"mF1": "56.9"},
+      "LSI": {"mF1": "22.26"},
+      "PCR": {"muF1@K": "-"},
+      "SUMM": {"ROUGE-L": "16.00", "BERTSCORE": "81.00"},
+      "L-MT": {"BLEU": "35.00", "GLEU": "38.00", "chrF++": "52.00"}
+    },
+    {
+      "Method": "GPT-4 (2-shot)",
+      "Submitted By": "IL-TUR",
+      "Github Link": "",
+      "L-NER": {"strict mF1": "24.03"},
+      "RR": {"mF1": "38.18"},
+      "CJPE": {"mF1": "60.44", "ROUGE-L": "43.00", "BLEU": "18.00"},
+      "BAIL": {"mF1": "66.67"},
+      "LSI": {"mF1": "20.53"},
+      "PCR": {"muF1@K": "-"},
+      "SUMM": {"ROUGE-L": "17.00", "BERTSCORE": "81.00"},
+      "L-MT": {"BLEU": "36.00", "GLEU": "39.00", "chrF++": "53.00"}
+    }
+  ]

submissions/baseline/results.json ADDED Viewed

	@@ -0,0 +1,133 @@

+[
+    {
+      "Method": "SOTA",
+      "Submitted By": "multiple",
+      "Github Link": "exploration-lab.github.io/IL-TUR/",
+      "L-NER": {"strict mF1": "48.58"},
+      "RR": {"mF1": "69.01"},
+      "CJPE": {"mF1": "81.31", "ROUGE-L": "56.00", "BLEU": "32.00"},
+      "BAIL": {"mF1": "81"},
+      "LSI": {"mF1": "28.08"},
+      "PCR": {"muF1@K": "39.15"},
+      "SUMM": {"ROUGE-L": "33.00", "BERTSCORE": "86.00"},
+      "L-MT": {"BLEU": "28.00", "GLEU": "32.00", "chrF++": "57.00"}
+    },
+    {
+      "Method": "BERT",
+      "Submitted By": "multiple",
+      "Github Link": "",
+      "L-NER": {"strict mF1": "39.59"},
+      "RR": {"mF1": "58"},
+      "CJPE": {"mF1": "71.14", "ROUGE-L": "-", "BLEU": "-"},
+      "BAIL": {"mF1": "-"},
+      "LSI": {"mF1": "18.44"},
+      "PCR": {"muF1@K": "9.24"},
+      "SUMM": {"ROUGE-L": "-", "BERTSCORE": "-"},
+      "L-MT": {"BLEU": "-", "GLEU": "-", "chrF++": "-"}
+    },
+    {
+      "Method": "LegalBERT",
+      "Submitted By": "multiple",
+      "Github Link": "",
+      "L-NER": {"strict mF1": "45.58"},
+      "RR": {"mF1": "54"},
+      "CJPE": {"mF1": "78.21", "ROUGE-L": "-", "BLEU": "-"},
+      "BAIL": {"mF1": "-"},
+      "LSI": {"mF1": "21.74"},
+      "PCR": {"muF1@K": "8.67"},
+      "SUMM": {"ROUGE-L": "-", "BERTSCORE": "-"},
+      "L-MT": {"BLEU": "-", "GLEU": "-", "chrF++": "-"}
+    },
+    {
+      "Method": "InLegalBERT",
+      "Submitted By": "multiple",
+      "Github Link": "",
+      "L-NER": {"strict mF1": "48.58"},
+      "RR": {"mF1": "58"},
+      "CJPE": {"mF1": "81.31", "ROUGE-L": "-", "BLEU": "-"},
+      "BAIL": {"mF1": "-"},
+      "LSI": {"mF1": "26.23"},
+      "PCR": {"muF1@K": "7.57"},
+      "SUMM": {"ROUGE-L": "-", "BERTSCORE": "-"},
+      "L-MT": {"BLEU": "-", "GLEU": "-", "chrF++": "-"}
+    },
+    {
+      "Method": "GPT-3.5 (0-shot)",
+      "Submitted By": "IL-TUR",
+      "Github Link": "",
+      "L-NER": {"strict mF1": "30.59"},
+      "RR": {"mF1": "30.95"},
+      "CJPE": {"mF1": "54.17", "ROUGE-L": "30.00", "BLEU": "8.00"},
+      "BAIL": {"mF1": "51.04"},
+      "LSI": {"mF1": "21.55"},
+      "PCR": {"muF1@K": "-"},
+      "SUMM": {"ROUGE-L": "21.00", "BERTSCORE": "85.00"},
+      "L-MT": {"BLEU": "23.00", "GLEU": "28.00", "chrF++": "42.00"}
+    },
+    {
+      "Method": "GPT-3.5 (1-shot)",
+      "Submitted By": "IL-TUR",
+      "Github Link": "",
+      "L-NER": {"strict mF1": "23.68"},
+      "RR": {"mF1": "30.05"},
+      "CJPE": {"mF1": "51.46", "ROUGE-L": "29.00", "BLEU": "15.00"},
+      "BAIL": {"mF1": "46.35"},
+      "LSI": {"mF1": "22.61"},
+      "PCR": {"muF1@K": "-"},
+      "SUMM": {"ROUGE-L": "20.00", "BERTSCORE": "84.00"},
+      "L-MT": {"BLEU": "25.00", "GLEU": "28.00", "chrF++": "43.00"}
+    },
+    {
+      "Method": "GPT-3.5 (2-shot)",
+      "Submitted By": "IL-TUR",
+      "Github Link": "",
+      "L-NER": {"strict mF1": "32.84"},
+      "RR": {"mF1": "30.31"},
+      "CJPE": {"mF1": "56.74", "ROUGE-L": "30.00", "BLEU": "11.00"},
+      "BAIL": {"mF1": "61"},
+      "LSI": {"mF1": "21.4"},
+      "PCR": {"muF1@K": "-"},
+      "SUMM": {"ROUGE-L": "22.00", "BERTSCORE": "84.00"},
+      "L-MT": {"BLEU": "26.00", "GLEU": "29.00", "chrF++": "43.00"}
+    },
+    {
+      "Method": "GPT-4 (0-shot)",
+      "Submitted By": "IL-TUR",
+      "Github Link": "",
+      "L-NER": {"strict mF1": "13.65"},
+      "RR": {"mF1": "37.37"},
+      "CJPE": {"mF1": "68.29", "ROUGE-L": "40.00", "BLEU": "14.00"},
+      "BAIL": {"mF1": "51.46"},
+      "LSI": {"mF1": "23.99"},
+      "PCR": {"muF1@K": "-"},
+      "SUMM": {"ROUGE-L": "23.00", "BERTSCORE": "85.00"},
+      "L-MT": {"BLEU": "33.00", "GLEU": "36.00", "chrF++": "50.00"}
+    },
+    {
+      "Method": "GPT-4 (1-shot)",
+      "Submitted By": "IL-TUR",
+      "Github Link": "",
+      "L-NER": {"strict mF1": "10.51"},
+      "RR": {"mF1": "37.43"},
+      "CJPE": {"mF1": "47.26", "ROUGE-L": "39.00", "BLEU": "16.00"},
+      "BAIL": {"mF1": "56.9"},
+      "LSI": {"mF1": "22.26"},
+      "PCR": {"muF1@K": "-"},
+      "SUMM": {"ROUGE-L": "16.00", "BERTSCORE": "81.00"},
+      "L-MT": {"BLEU": "35.00", "GLEU": "38.00", "chrF++": "52.00"}
+    },
+    {
+      "Method": "GPT-4 (2-shot)",
+      "Submitted By": "IL-TUR",
+      "Github Link": "",
+      "L-NER": {"strict mF1": "24.03"},
+      "RR": {"mF1": "38.18"},
+      "CJPE": {"mF1": "60.44", "ROUGE-L": "43.00", "BLEU": "18.00"},
+      "BAIL": {"mF1": "66.67"},
+      "LSI": {"mF1": "20.53"},
+      "PCR": {"muF1@K": "-"},
+      "SUMM": {"ROUGE-L": "17.00", "BERTSCORE": "81.00"},
+      "L-MT": {"BLEU": "36.00", "GLEU": "39.00", "chrF++": "53.00"}
+    }
+  ]

submissions/baseline/submission.json ADDED Viewed

	@@ -0,0 +1,16 @@

+[
+    {
+      "Method": "GPT-5 (2-shot)",
+      "Submitted By": "IL-TUR",
+      "Github Link": "dummy submission",
+      "L-NER": {"strict mF1": "24.03"},
+      "RR": {"mF1": "38.18"},
+      "CJPE": {"mF1": "60.44", "ROUGE-L": "43.00", "BLEU": "18.00"},
+      "BAIL": {"mF1": "66.67"},
+      "LSI": {"mF1": "20.53"},
+      "PCR": {"muF1@K": "-"},
+      "SUMM": {"ROUGE-L": "17.00", "BERTSCORE": "81.00"},
+      "L-MT": {"BLEU": "36.00", "GLEU": "39.00", "chrF++": "53.00"}
+    }
+  ]

uploads.py CHANGED Viewed

@@ -2,6 +2,7 @@ from email.utils import parseaddr
 from huggingface_hub import HfApi
 import os
 import datetime
 import pandas as pd
 LEADERBOARD_PATH = "Exploration-Lab/IL-TUR-Leaderboard"
@@ -59,54 +60,72 @@ def add_new_eval(
         mail,
     )
-    # load the file
-    df = pd.read_csv(path_to_file)
-    submission_df = pd.read_csv(path_to_file)
-    # modify the df to include metadata
-    df["Method"] = method_name
-    df["url"] = url
-    df["organisation"] = organisation
-    df["mail"] = parsed_mail
-    df["timestamp"] = datetime.datetime.now()
-    submission_df = pd.read_csv(path_to_file)
-    submission_df["Method"] = method_name
-    submission_df["Submitted By"] = organisation
-    # upload to spaces using the hf api at
-    path_in_repo = f"submissions/{method_name}"
-    file_name = f"{method_name}-{organisation}-{datetime.datetime.now().strftime('%Y-%m-%d')}.csv"
     # upload the df to spaces
     import io
-    buffer = io.BytesIO()
-    df.to_csv(buffer, index=False)  # Write the DataFrame to a buffer in CSV format
-    buffer.seek(0)  # Rewind the buffer to the beginning
-    api.upload_file(
-        repo_id=RESULTS_PATH,
-        path_in_repo=f"{path_in_repo}/{file_name}",
-        path_or_fileobj=buffer,
-        token=TOKEN,
-        repo_type="dataset",
-    )
-    # read the leaderboard
-    leaderboard_df = pd.read_csv(f"submissions/baseline/baseline.csv")
-    # append the new submission_df csv to the leaderboard
-    # leaderboard_df = leaderboard_df._append(submission_df)
-    leaderboard_df = pd.concat([leaderboard_df, submission_df], ignore_index=True)
-    # save the new leaderboard
-    # leaderboard_df.to_csv(f"submissions/baseline/baseline.csv", index=False)
     leaderboard_buffer = io.BytesIO()
-    leaderboard_df.to_csv(leaderboard_buffer, index=False)
     leaderboard_buffer.seek(0)
     api.upload_file(
         repo_id=LEADERBOARD_PATH,
-        path_in_repo=f"submissions/baseline/baseline.csv",
         path_or_fileobj=leaderboard_buffer,
         token=TOKEN,
         repo_type="space",

 from huggingface_hub import HfApi
 import os
 import datetime
+import json
 import pandas as pd
 LEADERBOARD_PATH = "Exploration-Lab/IL-TUR-Leaderboard"
         mail,
     )
+    # # load the file
+    # df = pd.read_csv(path_to_file)
+    # submission_df = pd.read_csv(path_to_file)
+    # # modify the df to include metadata
+    # df["Method"] = method_name
+    # df["url"] = url
+    # df["organisation"] = organisation
+    # df["mail"] = parsed_mail
+    # df["timestamp"] = datetime.datetime.now()
+    # submission_df = pd.read_csv(path_to_file)
+    # submission_df["Method"] = method_name
+    # submission_df["Submitted By"] = organisation
+    # # upload to spaces using the hf api at
+    # path_in_repo = f"submissions/{method_name}"
+    # file_name = f"{method_name}-{organisation}-{datetime.datetime.now().strftime('%Y-%m-%d')}.csv"
     # upload the df to spaces
     import io
+    # read the submission json file
+    with open(path_to_file, "r") as f:
+        submission = json.load(f)
+    with open("submissions/baseline/results.json", "r") as f:
+        results = json.load(f)
+    # update the results
+    results.append(submission[0])
     leaderboard_buffer = io.BytesIO()
+    # df.to_csv(buffer, index=False)  # Write the DataFrame to a buffer in CSV format
+    # buffer.seek(0)  # Rewind the buffer to the beginning
+    # save the results to buffer
+    leaderboard_buffer.write(json.dumps(results).encode())
     leaderboard_buffer.seek(0)
+    # api.upload_file(
+    #     repo_id=RESULTS_PATH,
+    #     path_in_repo=f"{path_in_repo}/{file_name}",
+    #     path_or_fileobj=buffer,
+    #     token=TOKEN,
+    #     repo_type="dataset",
+    # )
+    # # read the leaderboard
+    # leaderboard_df = pd.read_csv(f"submissions/baseline/baseline.csv")
+    # # append the new submission_df csv to the leaderboard
+    # # leaderboard_df = leaderboard_df._append(submission_df)
+    # # leaderboard_df = pd.concat([leaderboard_df, submission_df], ignore_index=True)
+    # # save the new leaderboard
+    # # leaderboard_df.to_csv(f"submissions/baseline/baseline.csv", index=False)
+    # leaderboard_buffer = io.BytesIO()
+    # leaderboard_df.to_csv(leaderboard_buffer, index=False)
+    # leaderboard_buffer.seek(0)
+    # with open("submissions/baseline/results.json", "w") as f:
+    #     json.dump(results, f)
     api.upload_file(
         repo_id=LEADERBOARD_PATH,
+        # path_in_repo=f"submissions/baseline/baseline.csv",
+        path_in_repo=f"submissions/results.json",
         path_or_fileobj=leaderboard_buffer,
         token=TOKEN,
         repo_type="space",