Commit
·
2b8f89d
1
Parent(s):
eb68762
add baseline results
Browse files- .DS_Store +0 -0
- app.py +313 -28
- dummy.py +15 -0
- submissions/.DS_Store +0 -0
- submissions/baseline/baseline -pre2.csv +12 -0
- submissions/baseline/baseline-pre.csv +7 -0
- submissions/baseline/baseline.csv +11 -7
- submissions/baseline/results-bacup.json +133 -0
- submissions/baseline/results.json +133 -0
- submissions/baseline/submission.json +16 -0
- uploads.py +53 -34
.DS_Store
CHANGED
|
Binary files a/.DS_Store and b/.DS_Store differ
|
|
|
app.py
CHANGED
|
@@ -7,15 +7,14 @@ from uploads import add_new_eval
|
|
| 7 |
|
| 8 |
CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
|
| 9 |
CITATION_BUTTON_TEXT = r"""@inproceedings{iltur-2024,
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
}
|
| 18 |
-
}"""
|
| 19 |
|
| 20 |
api = HfApi()
|
| 21 |
TOKEN = os.environ.get("TOKEN", None)
|
|
@@ -27,7 +26,7 @@ def restart_space():
|
|
| 27 |
|
| 28 |
|
| 29 |
# Function to load data from a given CSV file
|
| 30 |
-
def baseline_load_data(tasks):
|
| 31 |
# version = version.replace("%", "p")
|
| 32 |
file_path = f"submissions/baseline/baseline.csv" # Replace with your file paths
|
| 33 |
df = pd.read_csv(file_path)
|
|
@@ -46,6 +45,20 @@ def baseline_load_data(tasks):
|
|
| 46 |
"SUMM",
|
| 47 |
"Average",
|
| 48 |
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 49 |
if tasks is None:
|
| 50 |
breakpoint()
|
| 51 |
# based on the tasks, remove the columns that are not needed
|
|
@@ -65,14 +78,77 @@ def baseline_load_data(tasks):
|
|
| 65 |
column_names.remove("SUMM")
|
| 66 |
|
| 67 |
df = df[column_names]
|
| 68 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 69 |
df = df.drop_duplicates(subset=["Method"], keep="first")
|
| 70 |
|
| 71 |
return df
|
| 72 |
|
| 73 |
|
| 74 |
-
def load_data(tasks):
|
| 75 |
-
baseline_df = baseline_load_data(tasks)
|
| 76 |
|
| 77 |
return baseline_df
|
| 78 |
|
|
@@ -86,8 +162,29 @@ def search_leaderboard(df, query):
|
|
| 86 |
|
| 87 |
|
| 88 |
# Function to change the version of the leaderboard
|
| 89 |
-
def change_version(
|
| 90 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 91 |
return new_df
|
| 92 |
|
| 93 |
|
|
@@ -120,6 +217,57 @@ with demo:
|
|
| 120 |
label="Select Tasks",
|
| 121 |
choices=["L-NER", "RR", "CJPE", "BAIL", "LSI", "PCR", "SUMM"],
|
| 122 |
value=["L-NER", "RR", "CJPE", "BAIL", "LSI", "PCR", "SUMM"],
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 123 |
)
|
| 124 |
|
| 125 |
with gr.Row():
|
|
@@ -128,10 +276,22 @@ with demo:
|
|
| 128 |
show_label=False,
|
| 129 |
)
|
| 130 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 131 |
leaderboard_table = gr.components.Dataframe(
|
| 132 |
value=load_data(
|
| 133 |
# "baseline",
|
| 134 |
["L-NER", "RR", "CJPE", "BAIL", "LSI", "PCR", "SUMM"],
|
|
|
|
| 135 |
),
|
| 136 |
interactive=True,
|
| 137 |
visible=True,
|
|
@@ -151,31 +311,156 @@ with demo:
|
|
| 151 |
|
| 152 |
search_bar.change(
|
| 153 |
search_leaderboard,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 154 |
inputs=[
|
| 155 |
-
|
| 156 |
-
|
| 157 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 158 |
],
|
| 159 |
outputs=leaderboard_table,
|
| 160 |
)
|
| 161 |
-
|
| 162 |
tasks_checkbox.change(
|
| 163 |
change_version,
|
| 164 |
-
inputs=[
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 165 |
outputs=leaderboard_table,
|
| 166 |
)
|
| 167 |
|
| 168 |
-
with gr.Accordion("Submit
|
| 169 |
with gr.Row():
|
| 170 |
with gr.Column():
|
| 171 |
-
method_name_textbox = gr.Textbox(label="Method
|
| 172 |
-
url_textbox = gr.Textbox(label="
|
| 173 |
-
with gr.Column():
|
| 174 |
organisation = gr.Textbox(label="Organisation")
|
| 175 |
mail = gr.Textbox(label="Contact email")
|
|
|
|
| 176 |
file_output = gr.File()
|
| 177 |
-
|
| 178 |
-
submit_button = gr.Button("Submit Eval")
|
| 179 |
submission_result = gr.Markdown()
|
| 180 |
submit_button.click(
|
| 181 |
add_new_eval,
|
|
@@ -221,5 +506,5 @@ with demo:
|
|
| 221 |
scheduler = BackgroundScheduler()
|
| 222 |
scheduler.add_job(restart_space, "interval", seconds=3600)
|
| 223 |
scheduler.start()
|
| 224 |
-
|
| 225 |
-
demo.launch(share=True)
|
|
|
|
| 7 |
|
| 8 |
# Label and BibTeX text for the citation widget (presumably rendered further
# down in the app -- the usage is not visible in this part of the file).
CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
# Every BibTeX field must be terminated by a comma; the `author` field was
# missing one, which made the copied entry unparsable.
CITATION_BUTTON_TEXT = r"""@inproceedings{iltur-2024,
    title = "IL-TUR: Benchmark for Indian Legal Text Understanding and Reasoning",
    author = "Joshi, Abhinav and Paul, Shounak and Sharma, Akshat and Goyal, Pawan and Ghosh, Saptarshi and Modi, Ashutosh",
    booktitle = "Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)",
    month = aug,
    year = "2024",
    address = "Bangkok, Thailand",
    publisher = "Association for Computational Linguistics",
}"""
|
|
|
|
| 18 |
|
| 19 |
api = HfApi()
|
| 20 |
TOKEN = os.environ.get("TOKEN", None)
|
|
|
|
| 26 |
|
| 27 |
|
| 28 |
# Function to load data from a given CSV file
|
| 29 |
+
def baseline_load_data(tasks, task_metrics):
|
| 30 |
# version = version.replace("%", "p")
|
| 31 |
file_path = f"submissions/baseline/baseline.csv" # Replace with your file paths
|
| 32 |
df = pd.read_csv(file_path)
|
|
|
|
| 45 |
"SUMM",
|
| 46 |
"Average",
|
| 47 |
]
|
| 48 |
+
# Method,Submitted by,L-NER,RR,CJPE,BAIL,LSI,PCR,SUMM,L-MT
|
| 49 |
+
column_names = [
|
| 50 |
+
"Method",
|
| 51 |
+
"Submitted By",
|
| 52 |
+
"L-NER",
|
| 53 |
+
"RR",
|
| 54 |
+
"CJPE",
|
| 55 |
+
"BAIL",
|
| 56 |
+
"LSI",
|
| 57 |
+
"PCR",
|
| 58 |
+
"SUMM",
|
| 59 |
+
# "Average",
|
| 60 |
+
]
|
| 61 |
+
|
| 62 |
if tasks is None:
|
| 63 |
breakpoint()
|
| 64 |
# based on the tasks, remove the columns that are not needed
|
|
|
|
| 78 |
column_names.remove("SUMM")
|
| 79 |
|
| 80 |
df = df[column_names]
|
| 81 |
+
|
| 82 |
+
import json
|
| 83 |
+
|
| 84 |
+
# load the results json file
|
| 85 |
+
with open("submissions/baseline/results.json") as f:
|
| 86 |
+
results = json.load(f)
|
| 87 |
+
# add the results to the dataframe
|
| 88 |
+
# Method,Submitted By,L-NER,RR,CJPE,BAIL,LSI,PCR,SUMM,L-MT
|
| 89 |
+
# Metric,-,strict mF1,mF1,mF1|ROUGE-L|BLEU,mF1,mF1,muF1@K,ROUGE-L|BERTSCORE,BLEU|GLEU|chrF++
|
| 90 |
+
# create a new df to display the results
|
| 91 |
+
results_df = pd.DataFrame(
|
| 92 |
+
columns=[
|
| 93 |
+
"Method",
|
| 94 |
+
"Submitted By",
|
| 95 |
+
"Github Link",
|
| 96 |
+
"L-NER",
|
| 97 |
+
"RR",
|
| 98 |
+
"CJPE",
|
| 99 |
+
"BAIL",
|
| 100 |
+
"LSI",
|
| 101 |
+
"PCR",
|
| 102 |
+
"SUMM",
|
| 103 |
+
"L-MT",
|
| 104 |
+
# "Average",
|
| 105 |
+
]
|
| 106 |
+
)
|
| 107 |
+
# breakpoint()
|
| 108 |
+
for entry in results:
|
| 109 |
+
results_df = results_df.append(
|
| 110 |
+
{
|
| 111 |
+
"Method": entry["Method"],
|
| 112 |
+
"Submitted By": entry["Submitted By"],
|
| 113 |
+
"Github Link": entry["Github Link"],
|
| 114 |
+
"L-NER": entry["L-NER"][task_metrics["L-NER"]],
|
| 115 |
+
"RR": entry["RR"][task_metrics["RR"]],
|
| 116 |
+
"CJPE": entry["CJPE"][task_metrics["CJPE"]],
|
| 117 |
+
"BAIL": entry["BAIL"][task_metrics["BAIL"]],
|
| 118 |
+
"LSI": entry["LSI"][task_metrics["LSI"]],
|
| 119 |
+
"PCR": entry["PCR"][task_metrics["PCR"]],
|
| 120 |
+
"SUMM": entry["SUMM"][task_metrics["SUMM"]],
|
| 121 |
+
"L-MT": entry["L-MT"][task_metrics["L-MT"]],
|
| 122 |
+
# "Average": ,
|
| 123 |
+
},
|
| 124 |
+
ignore_index=True,
|
| 125 |
+
)
|
| 126 |
+
|
| 127 |
+
# breakpoint()
|
| 128 |
+
# add the average column
|
| 129 |
+
# results_df["Average"] = results_df.mean(axis=1)
|
| 130 |
+
|
| 131 |
+
df = results_df
|
| 132 |
+
# df = df.sort_values(by="Average", ascending=False)
|
| 133 |
+
# remove the columns that are not in tasks
|
| 134 |
+
selected_columns = (
|
| 135 |
+
[
|
| 136 |
+
"Method",
|
| 137 |
+
"Submitted By",
|
| 138 |
+
]
|
| 139 |
+
+ tasks
|
| 140 |
+
+ ["Github Link"]
|
| 141 |
+
)
|
| 142 |
+
print(tasks)
|
| 143 |
+
df = df[selected_columns]
|
| 144 |
+
|
| 145 |
df = df.drop_duplicates(subset=["Method"], keep="first")
|
| 146 |
|
| 147 |
return df
|
| 148 |
|
| 149 |
|
| 150 |
+
def load_data(tasks, task_metrics):
    """Return the leaderboard dataframe for the given tasks and metrics.

    Thin wrapper that forwards both arguments unchanged to
    ``baseline_load_data`` and returns its result.

    Args:
        tasks: Task column names to keep in the table.
        task_metrics: Mapping from task name to the metric name to display
            for that task.
    """
    return baseline_load_data(tasks, task_metrics)
|
| 154 |
|
|
|
|
| 162 |
|
| 163 |
|
| 164 |
# Function to change the version of the leaderboard
|
| 165 |
+
def change_version(
    tasks,
    l_ner_metric,
    rr_metric,
    cjpe_metric,
    bail_metric,
    lsi_metric,
    pcr_metric,
    summ_metric,
    lmt_metric,
):
    """Reload the leaderboard for the selected tasks and per-task metrics.

    Args:
        tasks: Task names to show (forwarded unchanged to ``load_data``).
        l_ner_metric: Metric name to display for the "L-NER" task.
        rr_metric: Metric name to display for the "RR" task.
        cjpe_metric: Metric name to display for the "CJPE" task.
        bail_metric: Metric name to display for the "BAIL" task.
        lsi_metric: Metric name to display for the "LSI" task.
        pcr_metric: Metric name to display for the "PCR" task.
        summ_metric: Metric name to display for the "SUMM" task.
        lmt_metric: Metric name to display for the "L-MT" task.

    Returns:
        Whatever ``load_data`` returns for the given selection.
    """
    # Pair each task name with its chosen metric, in the fixed task order.
    task_names = ("L-NER", "RR", "CJPE", "BAIL", "LSI", "PCR", "SUMM", "L-MT")
    chosen_metrics = (
        l_ner_metric,
        rr_metric,
        cjpe_metric,
        bail_metric,
        lsi_metric,
        pcr_metric,
        summ_metric,
        lmt_metric,
    )
    return load_data(tasks, dict(zip(task_names, chosen_metrics)))
|
| 189 |
|
| 190 |
|
|
|
|
| 217 |
label="Select Tasks",
|
| 218 |
choices=["L-NER", "RR", "CJPE", "BAIL", "LSI", "PCR", "SUMM"],
|
| 219 |
value=["L-NER", "RR", "CJPE", "BAIL", "LSI", "PCR", "SUMM"],
|
| 220 |
+
interactive=True,
|
| 221 |
+
)
|
| 222 |
+
|
| 223 |
+
with gr.Row():
|
| 224 |
+
l_ner_metric = gr.Radio(
|
| 225 |
+
label="L-NER",
|
| 226 |
+
choices=["strict mF1"],
|
| 227 |
+
value="strict mF1",
|
| 228 |
+
interactive=True,
|
| 229 |
+
)
|
| 230 |
+
rr_metric = gr.Radio(
|
| 231 |
+
label="RR",
|
| 232 |
+
choices=["mF1"],
|
| 233 |
+
value="mF1",
|
| 234 |
+
interactive=True,
|
| 235 |
+
)
|
| 236 |
+
cjpe_metric = gr.Radio(
|
| 237 |
+
label="CJPE",
|
| 238 |
+
choices=["mF1", "ROUGE-L", "BLEU"],
|
| 239 |
+
value="mF1",
|
| 240 |
+
interactive=True,
|
| 241 |
+
)
|
| 242 |
+
bail_metric = gr.Radio(
|
| 243 |
+
label="BAIL",
|
| 244 |
+
choices=["mF1"],
|
| 245 |
+
value="mF1",
|
| 246 |
+
interactive=True,
|
| 247 |
+
)
|
| 248 |
+
lsi_metric = gr.Radio(
|
| 249 |
+
label="LSI",
|
| 250 |
+
choices=["mF1"],
|
| 251 |
+
value="mF1",
|
| 252 |
+
interactive=True,
|
| 253 |
+
)
|
| 254 |
+
pcr_metric = gr.Radio(
|
| 255 |
+
label="PCR",
|
| 256 |
+
choices=["muF1@K"],
|
| 257 |
+
value="muF1@K",
|
| 258 |
+
interactive=True,
|
| 259 |
+
)
|
| 260 |
+
summ_metric = gr.Radio(
|
| 261 |
+
label="SUMM",
|
| 262 |
+
choices=["ROUGE-L", "BERTSCORE"],
|
| 263 |
+
value="ROUGE-L",
|
| 264 |
+
interactive=True,
|
| 265 |
+
)
|
| 266 |
+
lmt_metric = gr.Radio(
|
| 267 |
+
label="L-MT",
|
| 268 |
+
choices=["BLEU", "GLEU", "chrF++"],
|
| 269 |
+
value="BLEU",
|
| 270 |
+
interactive=True,
|
| 271 |
)
|
| 272 |
|
| 273 |
with gr.Row():
|
|
|
|
| 276 |
show_label=False,
|
| 277 |
)
|
| 278 |
|
| 279 |
+
task_metrics = {
|
| 280 |
+
"L-NER": l_ner_metric.value,
|
| 281 |
+
"RR": rr_metric.value,
|
| 282 |
+
"CJPE": cjpe_metric.value,
|
| 283 |
+
"BAIL": bail_metric.value,
|
| 284 |
+
"LSI": lsi_metric.value,
|
| 285 |
+
"PCR": pcr_metric.value,
|
| 286 |
+
"SUMM": summ_metric.value,
|
| 287 |
+
"L-MT": lmt_metric.value,
|
| 288 |
+
}
|
| 289 |
+
|
| 290 |
leaderboard_table = gr.components.Dataframe(
|
| 291 |
value=load_data(
|
| 292 |
# "baseline",
|
| 293 |
["L-NER", "RR", "CJPE", "BAIL", "LSI", "PCR", "SUMM"],
|
| 294 |
+
task_metrics=task_metrics,
|
| 295 |
),
|
| 296 |
interactive=True,
|
| 297 |
visible=True,
|
|
|
|
| 311 |
|
| 312 |
search_bar.change(
|
| 313 |
search_leaderboard,
|
| 314 |
+
inputs=[leaderboard_table, search_bar],
|
| 315 |
+
outputs=leaderboard_table,
|
| 316 |
+
)
|
| 317 |
+
# breakpoint()
|
| 318 |
+
# Every metric selector and the task checkbox refresh the table through the
# same callback with the same inputs, so the handlers are registered in one
# loop instead of nine copy-pasted calls that can drift apart.
# The order of this list must match the parameter order of change_version.
leaderboard_inputs = [
    tasks_checkbox,
    l_ner_metric,
    rr_metric,
    cjpe_metric,
    bail_metric,
    lsi_metric,
    pcr_metric,
    summ_metric,
    lmt_metric,
]
# Keep the original registration order: metric selectors first, then the
# task checkbox.
for trigger in leaderboard_inputs[1:] + leaderboard_inputs[:1]:
    trigger.change(
        change_version,
        inputs=leaderboard_inputs,
        outputs=leaderboard_table,
    )
|
| 453 |
|
| 454 |
+
with gr.Accordion("Submit the results of your Method"):
|
| 455 |
with gr.Row():
|
| 456 |
with gr.Column():
|
| 457 |
+
method_name_textbox = gr.Textbox(label="Method")
|
| 458 |
+
url_textbox = gr.Textbox(label="Github Link")
|
|
|
|
| 459 |
organisation = gr.Textbox(label="Organisation")
|
| 460 |
mail = gr.Textbox(label="Contact email")
|
| 461 |
+
with gr.Column():
|
| 462 |
file_output = gr.File()
|
| 463 |
+
submit_button = gr.Button("Submit Eval")
|
|
|
|
| 464 |
submission_result = gr.Markdown()
|
| 465 |
submit_button.click(
|
| 466 |
add_new_eval,
|
|
|
|
| 506 |
scheduler = BackgroundScheduler()
|
| 507 |
scheduler.add_job(restart_space, "interval", seconds=3600)
|
| 508 |
scheduler.start()
|
| 509 |
+
demo.launch(debug=True)
|
| 510 |
+
# demo.launch(share=True)
|
dummy.py
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Scratch script: merge the first entry of submission.json into the baseline results in memory."""
import json

# Load the current baseline results (a JSON list with one object per method).
with open("submissions/baseline/results.json", encoding="utf-8") as f:
    results = json.load(f)

# Load the new submission (a JSON list; only its first entry is used below).
with open("submissions/baseline/submission.json", encoding="utf-8") as f:
    submission = json.load(f)

# Fail with a clear message instead of an IndexError on an empty submission.
if not submission:
    raise ValueError("submissions/baseline/submission.json contains no entries")

# The leftover breakpoint() debugger trap was removed so the script can run
# non-interactively.
# NOTE(review): the merged list only lives in memory -- nothing is written
# back to results.json. Confirm whether persisting it is intended.
results.append(submission[0])
|
submissions/.DS_Store
CHANGED
|
Binary files a/submissions/.DS_Store and b/submissions/.DS_Store differ
|
|
|
submissions/baseline/baseline -pre2.csv
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Method,Submitted By,L-NER,RR,CJPE,BAIL,LSI,PCR,SUMM,L-MT
|
| 2 |
+
Metric,-,strict mF1,mF1,mF1|ROUGE-L|BLEU,mF1,mF1,muF1@K,ROUGE-L|BERTSCORE,BLEU|GLEU|chrF++
|
| 3 |
+
SOTA,various,48.58,69.01,81.31|56.00|32.00,81,28.08,39.15,33.00|86.00,28.00|32.00|57.00
|
| 4 |
+
BERT,various,39.59,58,71.14|-|-,-,18.44,9.24,-|-,-|-|-
|
| 5 |
+
LegalBERT,various,45.58,54,78.21|-|-,-,21.74,8.67,-|-,-|-|-
|
| 6 |
+
InLegalBERT,various,48.58,58,81.31|-|-,-,26.23,7.57,-|-,-|-|-
|
| 7 |
+
GPT-3.5 (0-shot),IL-TUR team,30.59,30.95,54.17|30.00|8.00,51.04,21.55,-,21.00|85.00,23.00|28.00|42.00
|
| 8 |
+
GPT-3.5 (1-shot),IL-TUR team,23.68,30.05,51.46|29.00|15.00,46.35,22.61,-,20.00|84.00,25.00|28.00|43.00
|
| 9 |
+
GPT-3.5 (2-shot),IL-TUR team,32.84,30.31,56.74|30.00|11.00,61,21.4,-,22.00|84.00,26.00|29.00|43.00
|
| 10 |
+
GPT-4 (0-shot),IL-TUR team,13.65,37.37,68.29|40.00|14.00,51.46,23.99,-,23.00|85.00,33.00|36.00|50.00
|
| 11 |
+
GPT-4 (1-shot),IL-TUR team,10.51,37.43,47.26|39.00|16.00,56.9,22.26,-,16.00|81.00,35.00|38.00|52.00
|
| 12 |
+
GPT-4 (2-shot),IL-TUR team,24.03,38.18,60.44|43.00|18.00,66.67,20.53,-,17.00|81.00,36.00|39.00|53.00
|
submissions/baseline/baseline-pre.csv
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Unnamed: 0,index,Method,Submitted By,L-NER,RR,CJPE,BAIL,LSI,PCR,SUMM,Average
|
| 2 |
+
,0,baseline,baseline,0,0,0,0,0,0,0,0
|
| 3 |
+
,0,baseline2,baseline2,0,0,0,0,0,0,0,0
|
| 4 |
+
,0,baseline,baseline,0,0,0,0,0,0,0,0
|
| 5 |
+
,0,random,random,0,0,0,0,0,0,0,0
|
| 6 |
+
,0,random2,random22,0,0,0,0,0,0,0,0
|
| 7 |
+
,0,random5,random55,0,0,0,0,0,0,0,0
|
submissions/baseline/baseline.csv
CHANGED
|
@@ -1,7 +1,11 @@
|
|
| 1 |
-
|
| 2 |
-
,
|
| 3 |
-
,
|
| 4 |
-
,
|
| 5 |
-
,
|
| 6 |
-
|
| 7 |
-
,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Method,Submitted By,L-NER strict mF1,RR mF1,CJPE mF1,CJPE ROUGE-L,CJPE BLEU,BAIL mF1,LSI mF1,PCR muF1@K,SUMM ROUGE-L,SUMM BERTSCORE,L-MT BLEU,L-MT GLEU,L-MT chrF++
|
| 2 |
+
SOTA,various,48.58,69.01,81.31,56.00,32.00,81,28.08,39.15,33.00,86.00,28.00,32.00,57.00
|
| 3 |
+
BERT,various,39.59,58,71.14,-,-,-,18.44,9.24,-,-,-,-,-
|
| 4 |
+
LegalBERT,various,45.58,54,78.21,-,-,-,21.74,8.67,-,-,-,-,-
|
| 5 |
+
InLegalBERT,various,48.58,58,81.31,-,-,-,26.23,7.57,-,-,-,-,-
|
| 6 |
+
GPT-3.5 (0-shot),IL-TUR team,30.59,30.95,54.17,30.00,8.00,51.04,21.55,-,21.00,85.00,23.00,28.00,42.00
|
| 7 |
+
GPT-3.5 (1-shot),IL-TUR team,23.68,30.05,51.46,29.00,15.00,46.35,22.61,-,20.00,84.00,25.00,28.00,43.00
|
| 8 |
+
GPT-3.5 (2-shot),IL-TUR team,32.84,30.31,56.74,30.00,11.00,61,21.4,-,22.00,84.00,26.00,29.00,43.00
|
| 9 |
+
GPT-4 (0-shot),IL-TUR team,13.65,37.37,68.29,40.00,14.00,51.46,23.99,-,23.00,85.00,33.00,36.00,50.00
|
| 10 |
+
GPT-4 (1-shot),IL-TUR team,10.51,37.43,47.26,39.00,16.00,56.9,22.26,-,16.00,81.00,35.00,38.00,52.00
|
| 11 |
+
GPT-4 (2-shot),IL-TUR team,24.03,38.18,60.44,43.00,18.00,66.67,20.53,-,17.00,81.00,36.00,39.00,53.00
|
submissions/baseline/results-bacup.json
ADDED
|
@@ -0,0 +1,133 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[
|
| 2 |
+
{
|
| 3 |
+
"Method": "SOTA",
|
| 4 |
+
"Submitted By": "multiple",
|
| 5 |
+
"Github Link": "exploration-lab.github.io/IL-TUR/",
|
| 6 |
+
"L-NER": {"strict mF1": "48.58"},
|
| 7 |
+
"RR": {"mF1": "69.01"},
|
| 8 |
+
"CJPE": {"mF1": "81.31", "ROUGE-L": "56.00", "BLEU": "32.00"},
|
| 9 |
+
"BAIL": {"mF1": "81"},
|
| 10 |
+
"LSI": {"mF1": "28.08"},
|
| 11 |
+
"PCR": {"muF1@K": "39.15"},
|
| 12 |
+
"SUMM": {"ROUGE-L": "33.00", "BERTSCORE": "86.00"},
|
| 13 |
+
"L-MT": {"BLEU": "28.00", "GLEU": "32.00", "chrF++": "57.00"}
|
| 14 |
+
},
|
| 15 |
+
{
|
| 16 |
+
"Method": "BERT",
|
| 17 |
+
"Submitted By": "multiple",
|
| 18 |
+
"Github Link": "",
|
| 19 |
+
"L-NER": {"strict mF1": "39.59"},
|
| 20 |
+
"RR": {"mF1": "58"},
|
| 21 |
+
"CJPE": {"mF1": "71.14", "ROUGE-L": "-", "BLEU": "-"},
|
| 22 |
+
"BAIL": {"mF1": "-"},
|
| 23 |
+
"LSI": {"mF1": "-"},
|
| 24 |
+
"PCR": {"muF1@K": "18.44"},
|
| 25 |
+
"SUMM": {"ROUGE-L": "9.24", "BERTSCORE": "-"},
|
| 26 |
+
"L-MT": {"BLEU": "-", "GLEU": "-", "chrF++": "-"}
|
| 27 |
+
},
|
| 28 |
+
{
|
| 29 |
+
"Method": "LegalBERT",
|
| 30 |
+
"Submitted By": "multiple",
|
| 31 |
+
"Github Link": "",
|
| 32 |
+
"L-NER": {"strict mF1": "45.58"},
|
| 33 |
+
"RR": {"mF1": "54"},
|
| 34 |
+
"CJPE": {"mF1": "78.21", "ROUGE-L": "-", "BLEU": "-"},
|
| 35 |
+
"BAIL": {"mF1": "-"},
|
| 36 |
+
"LSI": {"mF1": "-"},
|
| 37 |
+
"PCR": {"muF1@K": "21.74"},
|
| 38 |
+
"SUMM": {"ROUGE-L": "8.67", "BERTSCORE": "-"},
|
| 39 |
+
"L-MT": {"BLEU": "-", "GLEU": "-", "chrF++": "-"}
|
| 40 |
+
},
|
| 41 |
+
{
|
| 42 |
+
"Method": "InLegalBERT",
|
| 43 |
+
"Submitted By": "multiple",
|
| 44 |
+
"Github Link": "",
|
| 45 |
+
"L-NER": {"strict mF1": "48.58"},
|
| 46 |
+
"RR": {"mF1": "58"},
|
| 47 |
+
"CJPE": {"mF1": "81.31", "ROUGE-L": "-", "BLEU": "-"},
|
| 48 |
+
"BAIL": {"mF1": "-"},
|
| 49 |
+
"LSI": {"mF1": "-"},
|
| 50 |
+
"PCR": {"muF1@K": "26.23"},
|
| 51 |
+
"SUMM": {"ROUGE-L": "7.57", "BERTSCORE": "-"},
|
| 52 |
+
"L-MT": {"BLEU": "-", "GLEU": "-", "chrF++": "-"}
|
| 53 |
+
},
|
| 54 |
+
{
|
| 55 |
+
"Method": "GPT-3.5 (0-shot)",
|
| 56 |
+
"Submitted By": "IL-TUR",
|
| 57 |
+
"Github Link": "",
|
| 58 |
+
"L-NER": {"strict mF1": "30.59"},
|
| 59 |
+
"RR": {"mF1": "30.95"},
|
| 60 |
+
"CJPE": {"mF1": "54.17", "ROUGE-L": "30.00", "BLEU": "8.00"},
|
| 61 |
+
"BAIL": {"mF1": "51.04"},
|
| 62 |
+
"LSI": {"mF1": "21.55"},
|
| 63 |
+
"PCR": {"muF1@K": "-"},
|
| 64 |
+
"SUMM": {"ROUGE-L": "21.00", "BERTSCORE": "85.00"},
|
| 65 |
+
"L-MT": {"BLEU": "23.00", "GLEU": "28.00", "chrF++": "42.00"}
|
| 66 |
+
},
|
| 67 |
+
{
|
| 68 |
+
"Method": "GPT-3.5 (1-shot)",
|
| 69 |
+
"Submitted By": "IL-TUR",
|
| 70 |
+
"Github Link": "",
|
| 71 |
+
"L-NER": {"strict mF1": "23.68"},
|
| 72 |
+
"RR": {"mF1": "30.05"},
|
| 73 |
+
"CJPE": {"mF1": "51.46", "ROUGE-L": "29.00", "BLEU": "15.00"},
|
| 74 |
+
"BAIL": {"mF1": "46.35"},
|
| 75 |
+
"LSI": {"mF1": "22.61"},
|
| 76 |
+
"PCR": {"muF1@K": "-"},
|
| 77 |
+
"SUMM": {"ROUGE-L": "20.00", "BERTSCORE": "84.00"},
|
| 78 |
+
"L-MT": {"BLEU": "25.00", "GLEU": "28.00", "chrF++": "43.00"}
|
| 79 |
+
},
|
| 80 |
+
{
|
| 81 |
+
"Method": "GPT-3.5 (2-shot)",
|
| 82 |
+
"Submitted By": "IL-TUR",
|
| 83 |
+
"Github Link": "",
|
| 84 |
+
"L-NER": {"strict mF1": "32.84"},
|
| 85 |
+
"RR": {"mF1": "30.31"},
|
| 86 |
+
"CJPE": {"mF1": "56.74", "ROUGE-L": "30.00", "BLEU": "11.00"},
|
| 87 |
+
"BAIL": {"mF1": "61"},
|
| 88 |
+
"LSI": {"mF1": "21.4"},
|
| 89 |
+
"PCR": {"muF1@K": "-"},
|
| 90 |
+
"SUMM": {"ROUGE-L": "22.00", "BERTSCORE": "84.00"},
|
| 91 |
+
"L-MT": {"BLEU": "26.00", "GLEU": "29.00", "chrF++": "43.00"}
|
| 92 |
+
},
|
| 93 |
+
{
|
| 94 |
+
"Method": "GPT-4 (0-shot)",
|
| 95 |
+
"Submitted By": "IL-TUR",
|
| 96 |
+
"Github Link": "",
|
| 97 |
+
"L-NER": {"strict mF1": "13.65"},
|
| 98 |
+
"RR": {"mF1": "37.37"},
|
| 99 |
+
"CJPE": {"mF1": "68.29", "ROUGE-L": "40.00", "BLEU": "14.00"},
|
| 100 |
+
"BAIL": {"mF1": "51.46"},
|
| 101 |
+
"LSI": {"mF1": "23.99"},
|
| 102 |
+
"PCR": {"muF1@K": "-"},
|
| 103 |
+
"SUMM": {"ROUGE-L": "23.00", "BERTSCORE": "85.00"},
|
| 104 |
+
"L-MT": {"BLEU": "33.00", "GLEU": "36.00", "chrF++": "50.00"}
|
| 105 |
+
},
|
| 106 |
+
{
|
| 107 |
+
"Method": "GPT-4 (1-shot)",
|
| 108 |
+
"Submitted By": "IL-TUR",
|
| 109 |
+
"Github Link": "",
|
| 110 |
+
"L-NER": {"strict mF1": "10.51"},
|
| 111 |
+
"RR": {"mF1": "37.43"},
|
| 112 |
+
"CJPE": {"mF1": "47.26", "ROUGE-L": "39.00", "BLEU": "16.00"},
|
| 113 |
+
"BAIL": {"mF1": "56.9"},
|
| 114 |
+
"LSI": {"mF1": "22.26"},
|
| 115 |
+
"PCR": {"muF1@K": "-"},
|
| 116 |
+
"SUMM": {"ROUGE-L": "16.00", "BERTSCORE": "81.00"},
|
| 117 |
+
"L-MT": {"BLEU": "35.00", "GLEU": "38.00", "chrF++": "52.00"}
|
| 118 |
+
},
|
| 119 |
+
{
|
| 120 |
+
"Method": "GPT-4 (2-shot)",
|
| 121 |
+
"Submitted By": "IL-TUR",
|
| 122 |
+
"Github Link": "",
|
| 123 |
+
"L-NER": {"strict mF1": "24.03"},
|
| 124 |
+
"RR": {"mF1": "38.18"},
|
| 125 |
+
"CJPE": {"mF1": "60.44", "ROUGE-L": "43.00", "BLEU": "18.00"},
|
| 126 |
+
"BAIL": {"mF1": "66.67"},
|
| 127 |
+
"LSI": {"mF1": "20.53"},
|
| 128 |
+
"PCR": {"muF1@K": "-"},
|
| 129 |
+
"SUMM": {"ROUGE-L": "17.00", "BERTSCORE": "81.00"},
|
| 130 |
+
"L-MT": {"BLEU": "36.00", "GLEU": "39.00", "chrF++": "53.00"}
|
| 131 |
+
}
|
| 132 |
+
]
|
| 133 |
+
|
submissions/baseline/results.json
ADDED
|
@@ -0,0 +1,133 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[
|
| 2 |
+
{
|
| 3 |
+
"Method": "SOTA",
|
| 4 |
+
"Submitted By": "multiple",
|
| 5 |
+
"Github Link": "exploration-lab.github.io/IL-TUR/",
|
| 6 |
+
"L-NER": {"strict mF1": "48.58"},
|
| 7 |
+
"RR": {"mF1": "69.01"},
|
| 8 |
+
"CJPE": {"mF1": "81.31", "ROUGE-L": "56.00", "BLEU": "32.00"},
|
| 9 |
+
"BAIL": {"mF1": "81"},
|
| 10 |
+
"LSI": {"mF1": "28.08"},
|
| 11 |
+
"PCR": {"muF1@K": "39.15"},
|
| 12 |
+
"SUMM": {"ROUGE-L": "33.00", "BERTSCORE": "86.00"},
|
| 13 |
+
"L-MT": {"BLEU": "28.00", "GLEU": "32.00", "chrF++": "57.00"}
|
| 14 |
+
},
|
| 15 |
+
{
|
| 16 |
+
"Method": "BERT",
|
| 17 |
+
"Submitted By": "multiple",
|
| 18 |
+
"Github Link": "",
|
| 19 |
+
"L-NER": {"strict mF1": "39.59"},
|
| 20 |
+
"RR": {"mF1": "58"},
|
| 21 |
+
"CJPE": {"mF1": "71.14", "ROUGE-L": "-", "BLEU": "-"},
|
| 22 |
+
"BAIL": {"mF1": "-"},
|
| 23 |
+
"LSI": {"mF1": "18.44"},
|
| 24 |
+
"PCR": {"muF1@K": "9.24"},
|
| 25 |
+
"SUMM": {"ROUGE-L": "-", "BERTSCORE": "-"},
|
| 26 |
+
"L-MT": {"BLEU": "-", "GLEU": "-", "chrF++": "-"}
|
| 27 |
+
},
|
| 28 |
+
{
|
| 29 |
+
"Method": "LegalBERT",
|
| 30 |
+
"Submitted By": "multiple",
|
| 31 |
+
"Github Link": "",
|
| 32 |
+
"L-NER": {"strict mF1": "45.58"},
|
| 33 |
+
"RR": {"mF1": "54"},
|
| 34 |
+
"CJPE": {"mF1": "78.21", "ROUGE-L": "-", "BLEU": "-"},
|
| 35 |
+
"BAIL": {"mF1": "-"},
|
| 36 |
+
"LSI": {"mF1": "21.74"},
|
| 37 |
+
"PCR": {"muF1@K": "8.67"},
|
| 38 |
+
"SUMM": {"ROUGE-L": "-", "BERTSCORE": "-"},
|
| 39 |
+
"L-MT": {"BLEU": "-", "GLEU": "-", "chrF++": "-"}
|
| 40 |
+
},
|
| 41 |
+
{
|
| 42 |
+
"Method": "InLegalBERT",
|
| 43 |
+
"Submitted By": "multiple",
|
| 44 |
+
"Github Link": "",
|
| 45 |
+
"L-NER": {"strict mF1": "48.58"},
|
| 46 |
+
"RR": {"mF1": "58"},
|
| 47 |
+
"CJPE": {"mF1": "81.31", "ROUGE-L": "-", "BLEU": "-"},
|
| 48 |
+
"BAIL": {"mF1": "-"},
|
| 49 |
+
"LSI": {"mF1": "26.23"},
|
| 50 |
+
"PCR": {"muF1@K": "7.57"},
|
| 51 |
+
"SUMM": {"ROUGE-L": "-", "BERTSCORE": "-"},
|
| 52 |
+
"L-MT": {"BLEU": "-", "GLEU": "-", "chrF++": "-"}
|
| 53 |
+
},
|
| 54 |
+
{
|
| 55 |
+
"Method": "GPT-3.5 (0-shot)",
|
| 56 |
+
"Submitted By": "IL-TUR",
|
| 57 |
+
"Github Link": "",
|
| 58 |
+
"L-NER": {"strict mF1": "30.59"},
|
| 59 |
+
"RR": {"mF1": "30.95"},
|
| 60 |
+
"CJPE": {"mF1": "54.17", "ROUGE-L": "30.00", "BLEU": "8.00"},
|
| 61 |
+
"BAIL": {"mF1": "51.04"},
|
| 62 |
+
"LSI": {"mF1": "21.55"},
|
| 63 |
+
"PCR": {"muF1@K": "-"},
|
| 64 |
+
"SUMM": {"ROUGE-L": "21.00", "BERTSCORE": "85.00"},
|
| 65 |
+
"L-MT": {"BLEU": "23.00", "GLEU": "28.00", "chrF++": "42.00"}
|
| 66 |
+
},
|
| 67 |
+
{
|
| 68 |
+
"Method": "GPT-3.5 (1-shot)",
|
| 69 |
+
"Submitted By": "IL-TUR",
|
| 70 |
+
"Github Link": "",
|
| 71 |
+
"L-NER": {"strict mF1": "23.68"},
|
| 72 |
+
"RR": {"mF1": "30.05"},
|
| 73 |
+
"CJPE": {"mF1": "51.46", "ROUGE-L": "29.00", "BLEU": "15.00"},
|
| 74 |
+
"BAIL": {"mF1": "46.35"},
|
| 75 |
+
"LSI": {"mF1": "22.61"},
|
| 76 |
+
"PCR": {"muF1@K": "-"},
|
| 77 |
+
"SUMM": {"ROUGE-L": "20.00", "BERTSCORE": "84.00"},
|
| 78 |
+
"L-MT": {"BLEU": "25.00", "GLEU": "28.00", "chrF++": "43.00"}
|
| 79 |
+
},
|
| 80 |
+
{
|
| 81 |
+
"Method": "GPT-3.5 (2-shot)",
|
| 82 |
+
"Submitted By": "IL-TUR",
|
| 83 |
+
"Github Link": "",
|
| 84 |
+
"L-NER": {"strict mF1": "32.84"},
|
| 85 |
+
"RR": {"mF1": "30.31"},
|
| 86 |
+
"CJPE": {"mF1": "56.74", "ROUGE-L": "30.00", "BLEU": "11.00"},
|
| 87 |
+
"BAIL": {"mF1": "61"},
|
| 88 |
+
"LSI": {"mF1": "21.4"},
|
| 89 |
+
"PCR": {"muF1@K": "-"},
|
| 90 |
+
"SUMM": {"ROUGE-L": "22.00", "BERTSCORE": "84.00"},
|
| 91 |
+
"L-MT": {"BLEU": "26.00", "GLEU": "29.00", "chrF++": "43.00"}
|
| 92 |
+
},
|
| 93 |
+
{
|
| 94 |
+
"Method": "GPT-4 (0-shot)",
|
| 95 |
+
"Submitted By": "IL-TUR",
|
| 96 |
+
"Github Link": "",
|
| 97 |
+
"L-NER": {"strict mF1": "13.65"},
|
| 98 |
+
"RR": {"mF1": "37.37"},
|
| 99 |
+
"CJPE": {"mF1": "68.29", "ROUGE-L": "40.00", "BLEU": "14.00"},
|
| 100 |
+
"BAIL": {"mF1": "51.46"},
|
| 101 |
+
"LSI": {"mF1": "23.99"},
|
| 102 |
+
"PCR": {"muF1@K": "-"},
|
| 103 |
+
"SUMM": {"ROUGE-L": "23.00", "BERTSCORE": "85.00"},
|
| 104 |
+
"L-MT": {"BLEU": "33.00", "GLEU": "36.00", "chrF++": "50.00"}
|
| 105 |
+
},
|
| 106 |
+
{
|
| 107 |
+
"Method": "GPT-4 (1-shot)",
|
| 108 |
+
"Submitted By": "IL-TUR",
|
| 109 |
+
"Github Link": "",
|
| 110 |
+
"L-NER": {"strict mF1": "10.51"},
|
| 111 |
+
"RR": {"mF1": "37.43"},
|
| 112 |
+
"CJPE": {"mF1": "47.26", "ROUGE-L": "39.00", "BLEU": "16.00"},
|
| 113 |
+
"BAIL": {"mF1": "56.9"},
|
| 114 |
+
"LSI": {"mF1": "22.26"},
|
| 115 |
+
"PCR": {"muF1@K": "-"},
|
| 116 |
+
"SUMM": {"ROUGE-L": "16.00", "BERTSCORE": "81.00"},
|
| 117 |
+
"L-MT": {"BLEU": "35.00", "GLEU": "38.00", "chrF++": "52.00"}
|
| 118 |
+
},
|
| 119 |
+
{
|
| 120 |
+
"Method": "GPT-4 (2-shot)",
|
| 121 |
+
"Submitted By": "IL-TUR",
|
| 122 |
+
"Github Link": "",
|
| 123 |
+
"L-NER": {"strict mF1": "24.03"},
|
| 124 |
+
"RR": {"mF1": "38.18"},
|
| 125 |
+
"CJPE": {"mF1": "60.44", "ROUGE-L": "43.00", "BLEU": "18.00"},
|
| 126 |
+
"BAIL": {"mF1": "66.67"},
|
| 127 |
+
"LSI": {"mF1": "20.53"},
|
| 128 |
+
"PCR": {"muF1@K": "-"},
|
| 129 |
+
"SUMM": {"ROUGE-L": "17.00", "BERTSCORE": "81.00"},
|
| 130 |
+
"L-MT": {"BLEU": "36.00", "GLEU": "39.00", "chrF++": "53.00"}
|
| 131 |
+
}
|
| 132 |
+
]
|
| 133 |
+
|
submissions/baseline/submission.json
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[
|
| 2 |
+
{
|
| 3 |
+
"Method": "GPT-5 (2-shot)",
|
| 4 |
+
"Submitted By": "IL-TUR",
|
| 5 |
+
"Github Link": "dummy submission",
|
| 6 |
+
"L-NER": {"strict mF1": "24.03"},
|
| 7 |
+
"RR": {"mF1": "38.18"},
|
| 8 |
+
"CJPE": {"mF1": "60.44", "ROUGE-L": "43.00", "BLEU": "18.00"},
|
| 9 |
+
"BAIL": {"mF1": "66.67"},
|
| 10 |
+
"LSI": {"mF1": "20.53"},
|
| 11 |
+
"PCR": {"muF1@K": "-"},
|
| 12 |
+
"SUMM": {"ROUGE-L": "17.00", "BERTSCORE": "81.00"},
|
| 13 |
+
"L-MT": {"BLEU": "36.00", "GLEU": "39.00", "chrF++": "53.00"}
|
| 14 |
+
}
|
| 15 |
+
]
|
| 16 |
+
|
uploads.py
CHANGED
|
@@ -2,6 +2,7 @@ from email.utils import parseaddr
|
|
| 2 |
from huggingface_hub import HfApi
|
| 3 |
import os
|
| 4 |
import datetime
|
|
|
|
| 5 |
import pandas as pd
|
| 6 |
|
| 7 |
LEADERBOARD_PATH = "Exploration-Lab/IL-TUR-Leaderboard"
|
|
@@ -59,54 +60,72 @@ def add_new_eval(
|
|
| 59 |
mail,
|
| 60 |
)
|
| 61 |
|
| 62 |
-
# load the file
|
| 63 |
-
df = pd.read_csv(path_to_file)
|
| 64 |
-
submission_df = pd.read_csv(path_to_file)
|
| 65 |
|
| 66 |
-
# modify the df to include metadata
|
| 67 |
-
df["Method"] = method_name
|
| 68 |
-
df["url"] = url
|
| 69 |
-
df["organisation"] = organisation
|
| 70 |
-
df["mail"] = parsed_mail
|
| 71 |
-
df["timestamp"] = datetime.datetime.now()
|
| 72 |
|
| 73 |
-
submission_df = pd.read_csv(path_to_file)
|
| 74 |
-
submission_df["Method"] = method_name
|
| 75 |
-
submission_df["Submitted By"] = organisation
|
| 76 |
-
# upload to spaces using the hf api at
|
| 77 |
|
| 78 |
-
path_in_repo = f"submissions/{method_name}"
|
| 79 |
-
file_name = f"{method_name}-{organisation}-{datetime.datetime.now().strftime('%Y-%m-%d')}.csv"
|
| 80 |
|
| 81 |
# upload the df to spaces
|
| 82 |
import io
|
| 83 |
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
|
| 87 |
|
| 88 |
-
|
| 89 |
-
|
| 90 |
-
path_in_repo=f"{path_in_repo}/{file_name}",
|
| 91 |
-
path_or_fileobj=buffer,
|
| 92 |
-
token=TOKEN,
|
| 93 |
-
repo_type="dataset",
|
| 94 |
-
)
|
| 95 |
-
# read the leaderboard
|
| 96 |
-
leaderboard_df = pd.read_csv(f"submissions/baseline/baseline.csv")
|
| 97 |
|
| 98 |
-
#
|
| 99 |
-
|
| 100 |
-
leaderboard_df = pd.concat([leaderboard_df, submission_df], ignore_index=True)
|
| 101 |
|
| 102 |
-
# save the new leaderboard
|
| 103 |
-
# leaderboard_df.to_csv(f"submissions/baseline/baseline.csv", index=False)
|
| 104 |
leaderboard_buffer = io.BytesIO()
|
| 105 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 106 |
leaderboard_buffer.seek(0)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 107 |
api.upload_file(
|
| 108 |
repo_id=LEADERBOARD_PATH,
|
| 109 |
-
path_in_repo=f"submissions/baseline/baseline.csv",
|
|
|
|
| 110 |
path_or_fileobj=leaderboard_buffer,
|
| 111 |
token=TOKEN,
|
| 112 |
repo_type="space",
|
|
|
|
| 2 |
from huggingface_hub import HfApi
|
| 3 |
import os
|
| 4 |
import datetime
|
| 5 |
+
import json
|
| 6 |
import pandas as pd
|
| 7 |
|
| 8 |
LEADERBOARD_PATH = "Exploration-Lab/IL-TUR-Leaderboard"
|
|
|
|
| 60 |
mail,
|
| 61 |
)
|
| 62 |
|
| 63 |
+
# # load the file
|
| 64 |
+
# df = pd.read_csv(path_to_file)
|
| 65 |
+
# submission_df = pd.read_csv(path_to_file)
|
| 66 |
|
| 67 |
+
# # modify the df to include metadata
|
| 68 |
+
# df["Method"] = method_name
|
| 69 |
+
# df["url"] = url
|
| 70 |
+
# df["organisation"] = organisation
|
| 71 |
+
# df["mail"] = parsed_mail
|
| 72 |
+
# df["timestamp"] = datetime.datetime.now()
|
| 73 |
|
| 74 |
+
# submission_df = pd.read_csv(path_to_file)
|
| 75 |
+
# submission_df["Method"] = method_name
|
| 76 |
+
# submission_df["Submitted By"] = organisation
|
| 77 |
+
# # upload to spaces using the hf api at
|
| 78 |
|
| 79 |
+
# path_in_repo = f"submissions/{method_name}"
|
| 80 |
+
# file_name = f"{method_name}-{organisation}-{datetime.datetime.now().strftime('%Y-%m-%d')}.csv"
|
| 81 |
|
| 82 |
# upload the df to spaces
|
| 83 |
import io
|
| 84 |
|
| 85 |
+
# read the submission json file
|
| 86 |
+
with open(path_to_file, "r") as f:
|
| 87 |
+
submission = json.load(f)
|
| 88 |
|
| 89 |
+
with open("submissions/baseline/results.json", "r") as f:
|
| 90 |
+
results = json.load(f)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 91 |
|
| 92 |
+
# update the results
|
| 93 |
+
results.append(submission[0])
|
|
|
|
| 94 |
|
|
|
|
|
|
|
| 95 |
leaderboard_buffer = io.BytesIO()
|
| 96 |
+
# df.to_csv(buffer, index=False) # Write the DataFrame to a buffer in CSV format
|
| 97 |
+
# buffer.seek(0) # Rewind the buffer to the beginning
|
| 98 |
+
|
| 99 |
+
# save the results to buffer
|
| 100 |
+
leaderboard_buffer.write(json.dumps(results).encode())
|
| 101 |
leaderboard_buffer.seek(0)
|
| 102 |
+
|
| 103 |
+
# api.upload_file(
|
| 104 |
+
# repo_id=RESULTS_PATH,
|
| 105 |
+
# path_in_repo=f"{path_in_repo}/{file_name}",
|
| 106 |
+
# path_or_fileobj=buffer,
|
| 107 |
+
# token=TOKEN,
|
| 108 |
+
# repo_type="dataset",
|
| 109 |
+
# )
|
| 110 |
+
# # read the leaderboard
|
| 111 |
+
# leaderboard_df = pd.read_csv(f"submissions/baseline/baseline.csv")
|
| 112 |
+
|
| 113 |
+
# # append the new submission_df csv to the leaderboard
|
| 114 |
+
# # leaderboard_df = leaderboard_df._append(submission_df)
|
| 115 |
+
# # leaderboard_df = pd.concat([leaderboard_df, submission_df], ignore_index=True)
|
| 116 |
+
|
| 117 |
+
# # save the new leaderboard
|
| 118 |
+
# # leaderboard_df.to_csv(f"submissions/baseline/baseline.csv", index=False)
|
| 119 |
+
# leaderboard_buffer = io.BytesIO()
|
| 120 |
+
# leaderboard_df.to_csv(leaderboard_buffer, index=False)
|
| 121 |
+
# leaderboard_buffer.seek(0)
|
| 122 |
+
# with open("submissions/baseline/results.json", "w") as f:
|
| 123 |
+
# json.dump(results, f)
|
| 124 |
+
|
| 125 |
api.upload_file(
|
| 126 |
repo_id=LEADERBOARD_PATH,
|
| 127 |
+
# path_in_repo=f"submissions/baseline/baseline.csv",
|
| 128 |
+
path_in_repo=f"submissions/results.json",
|
| 129 |
path_or_fileobj=leaderboard_buffer,
|
| 130 |
token=TOKEN,
|
| 131 |
repo_type="space",
|