from typing import Optional

TASKS_PRETTY = {
    "library_based_code_generation": "Library-based code generation",
    "ci_builds_repair": "CI builds repair",
    "project_code_completion": "Project-level code completion",
    "commit_message_generation": "Commit message generation",
    "bug_localization": "Bug localization",
    "module_summarization": "Module Summarization",
}
TASKS_PRETTY_REVERSE = {value: key for key, value in TASKS_PRETTY.items()}

TASKS_DESCRIPTIONS = {
    "library_based_code_generation": """# Library-Based Code Generation\n
        
        Our Library-Based Code Generation benchmark 🤗 [JetBrains-Research/lca-library-based-code-generation](https://huggingface.co/datasets/JetBrains-Research/lca-library-based-code-generation) includes 150 manually curated instructions asking a model to generate Python code using a particular library. The samples come from 62 Python repositories. All the samples in the dataset are based on reference example programs written by the authors of the respective libraries.
        
        For evaluation, we use two metrics:
        * `API Recall`: the share of library-specific API calls used in the reference program that also appear in the generated code (see the sketch below),  
        * `ChrF`: textual similarity between the generated code and the reference program.  
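
        As an illustration of API Recall, the sketch below treats it as a set-overlap ratio over extracted call names; the AST-based call extraction is a simplifying assumption rather than the benchmark's official implementation.

        ```python
        import ast

        def api_recall(reference_code: str, generated_code: str) -> float:
            # Share of call names from the reference program that also appear in the generated code.
            # Illustrative sketch only: the real evaluation resolves library-specific calls.
            def called_names(code: str) -> set:
                names = set()
                for node in ast.walk(ast.parse(code)):
                    if isinstance(node, ast.Call):
                        func = node.func
                        if isinstance(func, ast.Attribute):
                            names.add(func.attr)
                        elif isinstance(func, ast.Name):
                            names.add(func.id)
                return names

            reference_calls = called_names(reference_code)
            if not reference_calls:
                return 1.0
            return len(reference_calls & called_names(generated_code)) / len(reference_calls)
        ```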

        For further details on the dataset and the baselines from the 🏟️ Long Code Arena Team, refer to the `library_based_code_generation` folder in [our baselines repository](https://github.com/JetBrains-Research/lca-baselines) or to our preprint (TODO).
        """,

    "ci_builds_repair": """# CI Builds Repair\n
        
        Our CI Builds Repair benchmark 🤗 [JetBrains-Research/lca-ci-builds-repair](https://huggingface.co/datasets/JetBrains-Research/lca-ci-builds-repair) includes 77 data points.

        We use the Pass@1 metric for evaluation.
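
        A simplified reading of Pass@1 in this setting (an assumption of this sketch, not the exact evaluation harness) is the share of benchmark instances whose repaired repository passes its CI run:

        ```python
        def pass_at_1(ci_passed: list) -> float:
            # ci_passed[i] is True if the repair for instance i led to a successful CI run.
            return sum(bool(passed) for passed in ci_passed) / len(ci_passed)
        ```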

        For further details on the dataset and the baselines from the 🏟️ Long Code Arena Team, refer to the `ci-builds-repair` folder in [our baselines repository](https://github.com/JetBrains-Research/lca-baselines) or to our preprint.
        """,

    "project_code_completion": """# Project-Level Code Completion\n
        
        Our Project-Level Code Completion benchmark 🤗 [JetBrains-Research/lca-project-level-code-completion](https://huggingface.co/datasets/JetBrains-Research/lca-project-level-code-completion) includes four datasets:
        * `small-context`: 144 data points,
        * `medium-context`: 224 data points,
        * `large-context`: 270 data points,
        * `huge-context`: 296 data points.

        We use the standard Exact Match (EM) metric for one-line code completion (a minimal sketch follows the category list).
        We evaluate Exact Match separately for the following line categories:
        * *infile* – functions and classes are from the completion file;
        * *inproject* – functions and classes are from the repository snapshot;
        * *committed* – functions and classes are from the files that were added in the commit with the completion file;
        * *common* – functions and classes with common names, e.g., `main`, `get`, etc.;
        * *non-informative* – short/long lines, import/print lines, or comment lines;
        * *random* – lines that don't fit into any of the previous categories.
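
        In the sketch below, the whitespace normalization and the per-category grouping are assumptions of this illustration rather than the exact post-processing used by the benchmark.

        ```python
        def exact_match(predicted_line: str, reference_line: str) -> bool:
            # A single-line completion counts only if it matches the ground truth after trimming whitespace.
            return predicted_line.strip() == reference_line.strip()

        def em_for_category(predictions: dict, references: dict, category_line_ids: list) -> float:
            # Average EM over the completion lines assigned to one category (e.g., "inproject").
            matches = [exact_match(predictions[i], references[i]) for i in category_line_ids]
            return sum(matches) / len(matches)
        ```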

        For further details on the dataset and the baselines from the 🏟️ Long Code Arena Team, refer to the `code_completion` folder in [our baselines repository](https://github.com/JetBrains-Research/lca-baselines) or to our preprint (TODO).
        """,

    "commit_message_generation": """# Commit Message Generation\n
        
        Our Commit Message Generation benchmark 🤗 [JetBrains-Research/lca-commit-message-generation](https://huggingface.co/datasets/JetBrains-Research/lca-commit-message-generation) includes 163 manually curated commits from 34 Python projects.
        
        We use the following metrics for evaluation (see the usage sketch after the list):
        * [BLEU](https://huggingface.co/spaces/evaluate-metric/sacrebleu)
        * [ROUGE](https://huggingface.co/spaces/evaluate-metric/rouge)
        * [ChrF](https://huggingface.co/spaces/evaluate-metric/chrf)
        * [BERTScore](https://huggingface.co/spaces/evaluate-metric/bertscore)
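
        As a usage sketch (the exact preprocessing behind the leaderboard numbers may differ), the linked 🤗 `evaluate` implementations can be called as follows:

        ```python
        import evaluate

        predictions = ["Fix off-by-one error in pagination"]
        references = ["Fix pagination off-by-one error"]

        bleu = evaluate.load("sacrebleu").compute(predictions=predictions, references=[[ref] for ref in references])
        rouge = evaluate.load("rouge").compute(predictions=predictions, references=references)
        chrf = evaluate.load("chrf").compute(predictions=predictions, references=[[ref] for ref in references])
        bertscore = evaluate.load("bertscore").compute(predictions=predictions, references=references, lang="en")
        ```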
        
        For further details on the dataset and the baselines from the 🏟️ Long Code Arena Team, refer to the `commit_message_generation` folder in [our baselines repository](https://github.com/JetBrains-Research/lca-baselines) or to our preprint.
        
        **Note.** The leaderboard is sorted by the ROUGE-1 metric by default.
        """,

    "bug_localization": """# Bug Localization\n
        
        Our Bug Localization benchmark 🤗 [JetBrains-Research/lca-bug-localization](https://huggingface.co/datasets/JetBrains-Research/lca-bug-localization) includes 150 manually verified bug issue descriptions with information about the pull requests that fix them, collected from Python, Java, and Kotlin projects.
        We use information retrieval metrics such as R@k, P@k, F1-score, and MAP for evaluation, with k equal to 1 and 2.
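
        As an illustration (the data layout here is an assumption of this sketch), R@k and P@k compare the top-k retrieved files against the files actually changed by the fixing pull request:

        ```python
        def precision_recall_at_k(ranked_files: list, changed_files: set, k: int) -> tuple:
            # ranked_files: the model's file ranking, most relevant first;
            # changed_files: files modified by the pull request that fixes the issue.
            hits = len(set(ranked_files[:k]) & changed_files)
            return hits / k, hits / len(changed_files)
        ```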

        For further details on the dataset and the baselines from the 🏟️ Long Code Arena Team, refer to the `bug_localization` folder in [our baselines repository](https://github.com/JetBrains-Research/lca-baselines/blob/main/bug_localization/).
        
    """,

    "module_summarization": """# Module Summarization\n
        Our Module Summarization benchmark 🤗 [JetBrains-Research/lca-module-summarization](https://huggingface.co/datasets/JetBrains-Research/lca-module-summarization) includes 216 manually curated text files describing the documentation of open-source, permissively licensed Python projects.

        We use a new metric for evaluation:
        * `CompScore`: a metric proposed specifically for this task; more details are available in [our baselines repository](https://github.com/JetBrains-Research/lca-baselines/blob/main/module_summarization/README.md).

        For further details on the dataset and the baselines from the 🏟️ Long Code Arena Team, refer to the `module_summarization` folder in [our baselines repository](https://github.com/JetBrains-Research/lca-baselines/blob/main/module_summarization/).
        """,
}


def get_submission_text_files_for_task(task_pretty: Optional[str]) -> str:
    if not task_pretty:
        return "Please, select a specific task to see more detailed instructions regarding submitting files."

    task_id = TASKS_PRETTY_REVERSE[task_pretty]

    if task_id == "commit_message_generation":
        return f"""**{task_pretty} Instructions:**\n\n* Please, attach files in [JSONLines format](https://jsonlines.org/). For an example, check the predictions provided by 🏟️ Long Code Arena Team in  πŸ€— [JetBrains-Research/lca-results](https://huggingface.co/datasets/JetBrains-Research/lca-results/tree/main/commit_message_generation/predictions). Make sure to include `"prediction"` and `"reference"` fields for each example, the rest are optional."""

    return f"**{task_pretty} Instructions:**\n\n* 🚧 There are no instructions for the current task yet."