Update src/tasks_content.py
src/tasks_content.py (+29 -30)
```diff
@@ -11,50 +11,49 @@ TASKS_PRETTY = {
 TASKS_PRETTY_REVERSE = {value: key for key, value in TASKS_PRETTY.items()}
 
 TASKS_DESCRIPTIONS = {
-"library_based_code_generation": """# Library-
+"library_based_code_generation": """# Library-based code generation\n
 
-Our Library-
+Our Library-based code generation benchmark 🤗 [JetBrains-Research/lca-library-based-code-generation](https://huggingface.co/datasets/JetBrains-Research/lca-library-based-code-generation) includes 150 manually curated instructions asking a model to generate Python code using a particular library. Samples come from 62 Python repositories. All the samples in the dataset are based on reference example programs written by authors of the respective libraries.
 
-For evaluation we use two metrics:
-* `API Recall`: share of library-specific API calls used in the reference program that appear in the generated code,
+For evaluation, we use two metrics:
 * `ChrF`: textual similarity between the generated code and the reference program.
+* `API Recall`: share of library-specific API calls used in the reference program that appear in the generated code,
 
-For further details on the dataset and the baselines from 🏟️ Long Code Arena
+For further details on the dataset and the baselines from the 🏟️ Long Code Arena team, refer to the `library_based_code_generation` directory in [our baselines repository](https://github.com/JetBrains-Research/lca-baselines).
 """,
 
-"ci_builds_repair": """# CI
+"ci_builds_repair": """# CI builds repair\n
 
-Our CI
+Our CI builds repair benchmark 🤗 [JetBrains-Research/lca-ci-builds-repair](https://huggingface.co/datasets/JetBrains-Research/lca-ci-builds-repair) includes 77 data points.
 
-We use Pass@1 metric for CI repair.
-We evaluate Exact Match for different line categories:
+We use the `Pass@1` metric for CI builds repair.
 
-For further details on the dataset and the baselines from 🏟️ Long Code Arena
+For further details on the dataset and the baselines from the 🏟️ Long Code Arena team, refer to the `ci-builds-repair` directory in [our baselines repository](https://github.com/JetBrains-Research/lca-baselines).
 """,
 
-"project_code_completion": """# Project-
+"project_code_completion": """# Project-level code completion\n
 
-Our Project-
+Our Project-level code completion benchmark 🤗 [JetBrains-Research/lca-project-level-code-completion](https://huggingface.co/datasets/JetBrains-Research/lca-project-level-code-completion) includes four sets of samples:
 * `small-context`: 144 data points,
 * `medium-context`: 224 data points,
 * `large-context`: 270 data points,
 * `huge-context`: 296 data points.
 
-We use standard Exact Match (EM) metric for one-line code completion.
-We evaluate Exact Match for different line categories:
+We use standard `Exact Match (EM)` metric for one-line code completion.
+We evaluate `Exact Match` for different line categories:
 * *infile* — functions and classes are from the completion file;
-* *inproject* — functions and files are from the repository snapshot;
+* *inproject* — functions and files are from the repository snapshot at the moment of completion;
 * *committed* — functions and classes are from the files that were added on the completion file commit;
 * *common* — functions and classes with common names, e.g., `main`, `get`, etc.;
 * *non-informative* — short/long lines, import/print lines, or comment lines;
-* *random* — lines that
+* *random* — lines that don't fit any of the previous categories.
 
-For further details on the dataset and the baselines from 🏟️ Long Code Arena
+For further details on the dataset and the baselines from the 🏟️ Long Code Arena team, refer to the `code_completion` directory in [our baselines repository](https://github.com/JetBrains-Research/lca-baselines).
 """,
 
-"commit_message_generation": """# Commit
+"commit_message_generation": """# Commit message generation\n
 
-Our Commit
+Our Commit message generation benchmark 🤗 [JetBrains-Research/lca-commit-message-generation](https://huggingface.co/datasets/JetBrains-Research/lca-commit-message-generation) includes 163 manually curated commits from 34 Python projects.
 
 We use the following metrics for evaluation:
 * [BLEU](https://huggingface.co/spaces/evaluate-metric/sacrebleu)
@@ -62,27 +61,27 @@
 * [ChrF](https://huggingface.co/spaces/evaluate-metric/chrf)
 * [BERTScore](https://huggingface.co/spaces/evaluate-metric/bertscore)
 
-For further details on the dataset and the baselines from 🏟️ Long Code Arena
+For further details on the dataset and the baselines from the 🏟️ Long Code Arena team, refer to the `commit_message_generation` directory in [our baselines repository](https://github.com/JetBrains-Research/lca-baselines).
 
-**Note.** The leaderboard is sorted by ROUGE-1 metric by default.
+**Note.** The leaderboard is sorted by the `ROUGE-1` metric by default.
 """,
 
-"bug_localization": """# Bug
+"bug_localization": """# Bug localization\n
 
-Our Bug
-We used information retrieval metrics such as R@k
+Our Bug localization benchmark 🤗 [JetBrains-Research/lca-bug-localization](https://huggingface.co/datasets/JetBrains-Research/lca-bug-localization) includes 150 manually verified bug issue descriptions with information about pull request that fix them for Python, Java, and Kotlin projects.
+We used information retrieval metrics such as `R@k`, `P@k`, `F1-score`, and `MAP` for evaluation, taking `k` equal to 1 and 2.
 
-For further details on the dataset and the baselines from 🏟️ Long Code Arena
+For further details on the dataset and the baselines from the 🏟️ Long Code Arena team, refer to the `bug_localization` directory in [our baselines repository](https://github.com/JetBrains-Research/lca-baselines).
 
 """,
 
-"module_summarization": """# Module
-Our Module
+"module_summarization": """# Module summarization\n
+Our Module summarization benchmark 🤗 [JetBrains-Research/lca-module-summarization](https://huggingface.co/datasets/JetBrains-Research/lca-module-summarization) includes 216 manually curated text files describing different documentation of open-source permissive Python projects.
 
-We use
-* `CompScore`:
+We use a novel metric for evaluation:
+* `CompScore`: a new metric proposed for this task. More details on how it is calculated can be found in [our baselines repository](https://github.com/JetBrains-Research/lca-baselines/blob/main/module_summarization/README.md).
 
-For further details on the dataset and the baselines from 🏟️ Long Code Arena
+For further details on the dataset and the baselines from the 🏟️ Long Code Arena team, refer to the `module_summarization` directory in [our baselines repository](https://github.com/JetBrains-Research/lca-baselines/blob/main/module_summarization/).
 """,
 }
 
```
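The task descriptions above lean on a handful of metrics, so a few illustrative sketches follow; none of them are the Long Code Arena team's actual evaluation code. For the library-based code generation metrics, the sketch below takes `ChrF` from the Hugging Face `evaluate` package and approximates `API Recall` by extracting call names with Python's `ast` module; treating every call in the reference program as a library-specific API call, as well as the helper names and the toy snippets, are simplifying assumptions of this example.

```python
# Illustrative sketch only (not the official Long Code Arena evaluation code).
# ChrF comes from Hugging Face `evaluate`; the API-recall part is a simplified
# approximation that treats every function/method call in the reference as an API call.
import ast

import evaluate


def call_names(code: str) -> set[str]:
    """Collect the names of all functions/methods called in a piece of Python code."""
    names = set()
    for node in ast.walk(ast.parse(code)):
        if isinstance(node, ast.Call):
            func = node.func
            if isinstance(func, ast.Name):
                names.add(func.id)
            elif isinstance(func, ast.Attribute):
                names.add(func.attr)
    return names


def api_recall(generated: str, reference: str) -> float:
    """Share of call names from the reference that also appear in the generated code."""
    ref_calls = call_names(reference)
    if not ref_calls:
        return 1.0
    return len(ref_calls & call_names(generated)) / len(ref_calls)


chrf = evaluate.load("chrf")

# Hypothetical reference and generated programs, just to exercise the functions.
reference = "import requests\nresponse = requests.get('https://example.com')\nresponse.raise_for_status()\n"
generated = "import requests\nresp = requests.get('https://example.com')\nprint(resp.status_code)\n"

print("ChrF:", chrf.compute(predictions=[generated], references=[[reference]])["score"])
print("API Recall:", api_recall(generated, reference))
```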
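For CI builds repair, `Pass@1` is commonly computed with the unbiased pass@k estimator from the HumanEval paper; with a single generated fix per data point it reduces to the share of data points whose repaired build passes. A minimal sketch, with made-up CI outcomes:

```python
# Hedged sketch of the standard pass@k estimator; the benchmark reports Pass@1,
# which with one attempt per data point is simply the fraction of green builds.
from math import comb


def pass_at_k(n: int, c: int, k: int) -> float:
    """Unbiased pass@k: n generated fixes per data point, c of which pass CI."""
    if n - c < k:
        return 1.0
    return 1.0 - comb(n - c, k) / comb(n, k)


# One attempt per build (n=1): Pass@1 is just the mean success indicator.
ci_results = [True, False, True, True]  # hypothetical CI outcomes for 4 data points
print(sum(ci_results) / len(ci_results))  # 0.75
print(pass_at_k(n=10, c=3, k=1))          # 0.3 when 3 of 10 sampled fixes pass
```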
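For project-level code completion, line-level `Exact Match` can be broken down by the line categories listed in the description. A minimal sketch, assuming whitespace-stripping normalization and hypothetical category labels (the baselines may normalize lines differently):

```python
# Minimal sketch of line-level Exact Match with an optional per-category breakdown.
# The whitespace-stripping normalization and the example data are assumptions.
from collections import defaultdict


def exact_match(predicted_line: str, reference_line: str) -> bool:
    """A completion counts only if it reproduces the reference line exactly (modulo surrounding whitespace)."""
    return predicted_line.strip() == reference_line.strip()


def em_by_category(predictions, references, categories):
    """Average Exact Match overall and per line category (infile, inproject, ...)."""
    buckets = defaultdict(list)
    for pred, ref, category in zip(predictions, references, categories):
        buckets[category].append(exact_match(pred, ref))
    scores = {cat: sum(hits) / len(hits) for cat, hits in buckets.items()}
    scores["overall"] = sum(exact_match(p, r) for p, r in zip(predictions, references)) / len(references)
    return scores


preds = ["return x + 1", "import os", "print(result)"]
refs = ["return x + 1 ", "import sys", "print(result)"]
cats = ["infile", "non-informative", "common"]
print(em_by_category(preds, refs, cats))
```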
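For commit message generation, the linked metric pages correspond to implementations in the Hugging Face `evaluate` package (with `ROUGE-1` mentioned in the sorting note). The sketch below loads them directly; the prediction/reference pair is invented, and the leaderboard's own scripts may apply additional preprocessing:

```python
# Sketch of scoring generated commit messages with the metric implementations
# linked in the description (Hugging Face `evaluate`).
import evaluate

predictions = ["Fix off-by-one error in pagination"]          # hypothetical model output
references = ["Fix off-by-one bug in the page iterator"]       # hypothetical ground truth

bleu = evaluate.load("sacrebleu")
rouge = evaluate.load("rouge")
chrf = evaluate.load("chrf")
bertscore = evaluate.load("bertscore")

print("BLEU:", bleu.compute(predictions=predictions, references=[[r] for r in references])["score"])
print("ROUGE-1:", rouge.compute(predictions=predictions, references=references)["rouge1"])
print("ChrF:", chrf.compute(predictions=predictions, references=[[r] for r in references])["score"])
print("BERTScore F1:", bertscore.compute(predictions=predictions, references=references, lang="en")["f1"][0])
```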
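For bug localization, `R@k`, `P@k`, `F1-score`, and `MAP` are standard retrieval metrics over a ranked list of repository files, with the files changed by the fixing pull request as ground truth. A self-contained sketch for a single bug report, using a hypothetical ranking (`MAP` is the mean of the per-report average precision):

```python
# Hedged sketch of the retrieval metrics named above for one bug report:
# the model ranks repository files, and `relevant` holds the files actually
# modified by the pull request that fixed the bug.
def precision_recall_at_k(ranked_files, relevant, k):
    top_k = ranked_files[:k]
    hits = sum(1 for f in top_k if f in relevant)
    return hits / k, hits / len(relevant)


def average_precision(ranked_files, relevant):
    hits, total = 0, 0.0
    for rank, f in enumerate(ranked_files, start=1):
        if f in relevant:
            hits += 1
            total += hits / rank
    return total / len(relevant)


ranked = ["src/app.py", "src/utils.py", "tests/test_app.py"]  # hypothetical ranking
relevant = {"src/utils.py"}                                    # hypothetical ground truth

for k in (1, 2):
    p, r = precision_recall_at_k(ranked, relevant, k)
    f1 = 0.0 if p + r == 0 else 2 * p * r / (p + r)
    print(f"P@{k}={p:.2f} R@{k}={r:.2f} F1@{k}={f1:.2f}")
print("AP:", average_precision(ranked, relevant))  # MAP averages this over all bug reports
```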