# long-code-arena/src/leaderboard_formatting.py
from typing import List

# Common dictionary mapping raw column names to their pretty display names
COLUMNS_PRETTY = {
    "bleu": "BLEU",
    "chrf": "ChrF",
    "rouge1": "ROUGE-1",
    "rouge2": "ROUGE-2",
    "rougeL": "ROUGE-L",
    "bertscore": "BERTScore",
    "bertscore_normalized": "BERTScore (Normalized)",
    "model_name": "Model Name",
    "model_availability": "Availability",
    "urls": "Resources",
    "context_size": "Context Size",
    "submitted_by": "Submitted By",
    "EM infile": "EM infile",
    "EM inproject": "EM inproject",
    "EM common": "EM common",
    "EM commited": "EM committed",  # key keeps the raw column's spelling
    "EM non_informative": "EM non-informative",
    "EM random": "EM random",
    "EM all": "EM all",
    "context_composer": "Context Composer",
    "context_length": "Context Size",
    "dataset": "Dataset",
    "CompScore": "CompScore",
    "context": "Context",
    "task_type": "Task type",
}

# Metrics displayed on the leaderboard for each task; add new metrics here
METRICS_PER_TASK = {
    "aggregated": [
        "Mean Rank",
        "Mean Score",
        "Library-based CG",
        "CI builds repair",
        "CMG",
        "Bug localization",
        "Module summarization",
    ],
    "commit_message_generation": [
        "BLEU",
        "ChrF",
        "ROUGE-1",
        "ROUGE-2",
        "ROUGE-L",
        "BERTScore",
        "BERTScore (Normalized)",
    ],
    "project_code_completion": [
        "EM infile",
        "EM inproject",
        "EM common",
        "EM committed",
        "EM non-informative",
        "EM random",
        "EM all",
    ],
    "bug_localization": [
        "P",
        "R",
        "FPR",
        "F1-score",
        "All_correct",
        "All_incorrect",
        "Output_count",
    ],
    "module_summarization": [
        "CompScore",
    ],
    "library_based_code_generation": [
        "API Recall\nno context",
        "API Recall\n20 APIs",
        "API Recall\n200 APIs",
        "API Recall\n2,000 APIs",
        "API Recall\nall APIs",
        "ChrF\nno context",
        "ChrF\n20 APIs",
        "ChrF\n200 APIs",
        "ChrF\n2,000 APIs",
        "ChrF\nall APIs",
    ],
    "ci_builds_repair": [
        "Pass@1",
    ],
}

# Default column used to sort each task's leaderboard table
SORT_COLUMN_PER_TASK = {
    "commit_message_generation": "ROUGE-1",
    "project_code_completion": "EM inproject",
    "bug_localization": "Model Name",
    "module_summarization": "CompScore",
    "library_based_code_generation": "API Recall\nall APIs",
    "ci_builds_repair": "Pass@1",
}


def get_columns_per_task(task_id: str) -> List[str]:
    """Return the ordered list of column headers for the given task's leaderboard table."""
    metrics_per_task = METRICS_PER_TASK[task_id]
    if task_id == 'aggregated':
        return ["Model Name"] + metrics_per_task
    if task_id == 'project_code_completion':
        return ["Model Name", "Context Composer", "Context Size", "Dataset Name", "Dataset"] + metrics_per_task + ["Submitted By", "Resources"]
    if task_id == 'bug_localization':
        return ["Model Name", "Availability", "Context Size"] + metrics_per_task + ["Submitted By", "Resources"]
    if task_id == 'module_summarization':
        return ["Model Name", "Context Size"] + metrics_per_task + ["Submitted By", "Resources"]
    if task_id == 'library_based_code_generation':
        return ["Model Name"] + metrics_per_task + ["Availability", "Submitted By", "Resources"]
    if task_id == 'ci_builds_repair':
        return ["Model Name", "Context Size", "Task type"] + metrics_per_task + ["Availability", "Submitted By", "Resources"]
    return ["Model Name", "Context Size", "Availability"] + metrics_per_task + ["Submitted By", "Resources"]
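
# Usage sketch (illustrative only, not called by the leaderboard app); the expected
# result can be read off METRICS_PER_TASK and the branches above:
#   get_columns_per_task("ci_builds_repair")
#   -> ['Model Name', 'Context Size', 'Task type', 'Pass@1', 'Availability', 'Submitted By', 'Resources']
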

def get_types_per_task(task_id: str) -> List[str]:
    """Return the display types (html/markdown/number) for the leaderboard columns of the given task."""
    # A task id missing from METRICS_PER_TASK falls back to a five-element placeholder,
    # which is used only for its length (i.e. five numeric metric columns).
    metrics_per_task = METRICS_PER_TASK.get(task_id, (0, 0, 0, 0, 0))
    if task_id == 'project_code_completion':
        return ["html", "markdown", "markdown", "markdown", "html"] + ["number" for _ in metrics_per_task] + ["markdown", "html"]
    if task_id == 'bug_localization':
        return ["html", "markdown", "markdown"] + ["number" for _ in metrics_per_task] + ["markdown", "html"]
    if task_id == 'ci_builds_repair':
        return ["html", "markdown", "markdown"] + ["number" for _ in metrics_per_task] + ["markdown", "markdown", "html"]
    return ["html", "markdown", "markdown"] + ["number" for _ in metrics_per_task] + ["markdown", "html"]
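

# A minimal self-check sketch, not part of the leaderboard app itself: it assumes only
# the definitions above and prints, for every task in METRICS_PER_TASK, how many column
# headers and column display types the two helpers return, so any count mismatch is visible.
if __name__ == "__main__":
    for task in METRICS_PER_TASK:
        columns = get_columns_per_task(task)
        types = get_types_per_task(task)
        print(f"{task}: {len(columns)} columns, {len(types)} column types")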