File size: 7,281 Bytes
2a0bc63
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
#!/usr/bin/env python

# Copyright 2022 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
A collection of utilities for comparing `examples/complete_*_example.py` scripts with the capabilities inside of each
`examples/by_feature` example. `compare_against_test` is the main function that should be used when testing, while the
others are used to either get the code that matters, or to preprocess them (such as stripping comments)
"""

import os
from typing import List


def get_function_contents_by_name(lines: List[str], name: str):
    """
    Extracts a function from `lines` of segmented source code with the name `name`.

    Args:
        lines (`List[str]`):
            Source code of a script seperated by line.
        name (`str`):
            The name of the function to extract. Should be either `training_function` or `main`
    """
    if name != "training_function" and name != "main":
        raise ValueError(f"Incorrect function name passed: {name}, choose either 'main' or 'training_function'")
    good_lines, found_start = [], False
    for line in lines:
        if not found_start and f"def {name}" in line:
            found_start = True
            good_lines.append(line)
            continue
        if found_start:
            if name == "training_function" and "def main" in line:
                return good_lines
            if name == "main" and "if __name__" in line:
                return good_lines
            good_lines.append(line)


def clean_lines(lines: List[str]):
    """
    Filters `lines` and removes any entries that start with a comment ('#') or is just a newline ('\n')

    Args:
        lines (`List[str]`):
            Source code of a script seperated by line.
    """
    return [line for line in lines if not line.lstrip().startswith("#") and line != "\n"]


def compare_against_test(base_filename: str, feature_filename: str, parser_only: bool, secondary_filename: str = None):
    """
    Tests whether the additional code inside of `feature_filename` was implemented in `base_filename`. This should be
    used when testing to see if `complete_*_.py` examples have all of the implementations from each of the
    `examples/by_feature/*` scripts.

    It utilizes `nlp_example.py` to extract out all of the repeated training code, so that only the new additional code
    is examined and checked. If something *other* than `nlp_example.py` should be used, such as `cv_example.py` for the
    `complete_cv_example.py` script, it should be passed in for the `secondary_filename` parameter.

    Args:
        base_filename (`str` or `os.PathLike`):
            The filepath of a single "complete" example script to test, such as `examples/complete_cv_example.py`
        feature_filename (`str` or `os.PathLike`):
            The filepath of a single feature example script. The contents of this script are checked to see if they
            exist in `base_filename`
        parser_only (`bool`):
            Whether to compare only the `main()` sections in both files, or to compare the contents of
            `training_loop()`
        secondary_filename (`str`, *optional*):
            A potential secondary filepath that should be included in the check. This function extracts the base
            functionalities off of "examples/nlp_example.py", so if `base_filename` is a script other than
            `complete_nlp_example.py`, the template script should be included here. Such as `examples/cv_example.py`
    """
    with open(base_filename) as f:
        base_file_contents = f.readlines()
    with open(os.path.abspath(os.path.join("examples", "nlp_example.py"))) as f:
        full_file_contents = f.readlines()
    with open(feature_filename) as f:
        feature_file_contents = f.readlines()
    if secondary_filename is not None:
        with open(secondary_filename) as f:
            secondary_file_contents = f.readlines()

    # This is our base, we remove all the code from here in our `full_filename` and `feature_filename` to find the new content
    if parser_only:
        base_file_func = clean_lines(get_function_contents_by_name(base_file_contents, "main"))
        full_file_func = clean_lines(get_function_contents_by_name(full_file_contents, "main"))
        feature_file_func = clean_lines(get_function_contents_by_name(feature_file_contents, "main"))
        if secondary_filename is not None:
            secondary_file_func = clean_lines(get_function_contents_by_name(secondary_file_contents, "main"))
    else:
        base_file_func = clean_lines(get_function_contents_by_name(base_file_contents, "training_function"))
        full_file_func = clean_lines(get_function_contents_by_name(full_file_contents, "training_function"))
        feature_file_func = clean_lines(get_function_contents_by_name(feature_file_contents, "training_function"))
        if secondary_filename is not None:
            secondary_file_func = clean_lines(
                get_function_contents_by_name(secondary_file_contents, "training_function")
            )

    _dl_line = "train_dataloader, eval_dataloader = get_dataloaders(accelerator, batch_size)\n"

    # Specific code in our script that differs from the full version, aka what is new
    new_feature_code = []
    passed_idxs = []  # We keep track of the idxs just in case it's a repeated statement
    it = iter(feature_file_func)
    for i in range(len(feature_file_func) - 1):
        if i not in passed_idxs:
            line = next(it)
            if (line not in full_file_func) and (line.lstrip() != _dl_line):
                if "TESTING_MOCKED_DATALOADERS" not in line:
                    new_feature_code.append(line)
                    passed_idxs.append(i)
                else:
                    # Skip over the `config['num_epochs'] = 2` statement
                    _ = next(it)

    # Extract out just the new parts from the full_file_training_func
    new_full_example_parts = []
    passed_idxs = []  # We keep track of the idxs just in case it's a repeated statement
    for i, line in enumerate(base_file_func):
        if i not in passed_idxs:
            if (line not in full_file_func) and (line.lstrip() != _dl_line):
                if "TESTING_MOCKED_DATALOADERS" not in line:
                    new_full_example_parts.append(line)
                    passed_idxs.append(i)

    # Finally, get the overall diff
    diff_from_example = [line for line in new_feature_code if line not in new_full_example_parts]
    if secondary_filename is not None:
        diff_from_two = [line for line in full_file_contents if line not in secondary_file_func]
        diff_from_example = [line for line in diff_from_example if line not in diff_from_two]

    return diff_from_example