Spaces: Runtime error
Commit codes
- __init__.py +0 -0
- app/__init__.py +0 -0
- app/components.py +217 -0
- app/configuration.py +17 -0
- app/main.py +36 -0
- app/utils.py +216 -0
- requirements.txt +4 -0
- templates/Accelerate/task_templates/fill-mask.py.jinja +271 -0
- templates/Accelerate/task_templates/text-generation.py.jinja +207 -0
- templates/Accelerate/task_templates/translation.py.jinja +287 -0
- templates/Trainer/task_templates/fill-mask.py.jinja +185 -0
- templates/Trainer/task_templates/text-generation.py.jinja +152 -0
- templates/Trainer/task_templates/translation.py.jinja +218 -0
__init__.py
ADDED
File without changes
app/__init__.py
ADDED
File without changes
app/components.py
ADDED
@@ -0,0 +1,217 @@
import collections
import os
from typing import Dict

import streamlit as st
from datasets import get_dataset_config_names
from jinja2 import Environment, FileSystemLoader

import utils
from configuration import OPTIMIZERS_ACCELERATE, OPTIMIZERS_TRAINER, TASKS, TASKS_TO_PIPELINE_TAG
from utils import (get_dataset_infos_dict, get_datasets, get_model_to_model_id,
                   render_features)


def show_API_component(inputs: Dict[str, str]) -> Dict[str, str]:
    template_dict = collections.defaultdict()
    template_dirs = [
        f for f in os.scandir("templates") if f.is_dir() and f.name != "example"
    ]
    template_dirs = sorted(template_dirs, key=lambda e: e.name)
    for template_dir in template_dirs:
        template_dict[template_dir.name] = template_dir.path
    st.write("## API")
    inputs['api'] = st.selectbox(
        "Which Hugging Face API do you want to use?", list(template_dict.keys())
    )
    inputs['template_dir'] = template_dict.get(inputs['api'])
    return inputs


def show_model_component(inputs: Dict[str, str]) -> Dict[str, str]:
    model_info = get_model_to_model_id()
    models = model_info['model_to_model_id']
    models_pipeline = model_info["model_to_pipeline_tag"]
    st.write("## Model")
    models_for_task = []
    for model in models:
        if models_pipeline[model] == inputs["nlp_task"]:
            models_for_task.append(model)
    model = st.selectbox("Which model?", list(models_for_task))
    inputs["model_checkpoint"] = models.get(model)
    inputs["pretrained"] = st.checkbox("Use pre-trained model")
    return inputs


def show_task_component(inputs: Dict[str, str]) -> Dict[str, str]:
    st.write("## Task")
    task = st.selectbox("Which task?", TASKS)
    inputs["task"] = task
    inputs["nlp_task"] = st.selectbox(
        "Which NLP task?", TASKS_TO_PIPELINE_TAG[task])
    return inputs


def show_input_data_component(inputs: Dict[str, str]) -> Dict[str, str]:
    st.write("## Input data")
    english_datasets = get_datasets()
    english_datasets_for_task = []

    for dataset in english_datasets:
        for task_category in english_datasets[dataset]:
            if task_category == inputs["nlp_task"]:
                english_datasets_for_task.append(dataset)
                continue

    inputs["dataset"] = st.selectbox(
        "Which one?", tuple(english_datasets_for_task)
    )

    configs = get_dataset_config_names(inputs["dataset"])
    inputs["subset"] = st.selectbox("Which subset?", list(configs))

    data_info_dict = get_dataset_infos_dict(
        inputs["dataset"], inputs["subset"])

    assert data_info_dict.splits is not None
    if 'train' in list(data_info_dict.splits.keys()):
        train_index = list(data_info_dict.splits.keys()).index('train')
    else:
        train_index = 0

    inputs["train"] = st.selectbox("Which split for training?", list(
        data_info_dict.splits.keys()), index=train_index)

    if 'validation' in list(data_info_dict.splits.keys()):
        validation_index = list(
            data_info_dict.splits.keys()).index('validation')
    else:
        validation_index = len(list(data_info_dict.splits.keys())) - 1

    inputs["validation"] = st.selectbox("Which split for validation?", list(
        data_info_dict.splits.keys()), index=validation_index)

    assert data_info_dict.features is not None
    feature_index = 0
    if inputs["nlp_task"] == 'translation':
        if 'translation' in list(data_info_dict.features.keys()):
            feature_index = list(
                data_info_dict.features.keys()).index('translation')

    inputs["feature"] = st.selectbox(
        "Which data feature?", list(data_info_dict.features.keys()), feature_index)

    if inputs["feature"] == 'translation':
        inputs["source_language"] = st.selectbox(
            "Which language for source?", list(data_info_dict.features['translation'].languages))
        inputs["target_language"] = st.selectbox(
            "Which language for target?", list(data_info_dict.features['translation'].languages))

    return inputs


def show_preprocessing_component(inputs: Dict[str, str]) -> Dict[str, str]:
    st.write("## Preprocessing")
    inputs["block_size"] = st.number_input(
        "The length of each block (i.e. context size)", 1, None, 128)

    if inputs["task"] == "MaskedLM":
        inputs["mlm_probability"] = st.number_input(
            "The probability with which to (randomly) mask tokens in the input", 0.0, 1.00, 0.15)
        inputs["whole_word_masking"] = st.checkbox(
            "Use whole word masking")
    return inputs


def show_training_comoponent(inputs: Dict[str, str]) -> Dict[str, str]:
    st.write("## Training")

    # inputs['with_tracker'] = st.selectbox(
    #     "Loggers to monitor the training ", ["none", "all", "tensorboard", "wandb", "comet_ml"])
    inputs["seed"] = st.number_input(
        "Seed", 1, None, 4)

    if inputs['api'] == 'Accelerate':
        optimizer_dict_to_use = OPTIMIZERS_ACCELERATE
    else:
        optimizer_dict_to_use = OPTIMIZERS_TRAINER

    inputs["optimizer"] = st.selectbox(
        "Optimizer", list(optimizer_dict_to_use.keys()))
    default_lr = optimizer_dict_to_use[inputs["optimizer"]]
    inputs["lr"] = st.number_input(
        "Learning rate", 0.000, None, default_lr, format="%f"
    )
    inputs["use_weight_decay"] = st.checkbox("Use weight decay")
    if inputs["use_weight_decay"]:
        inputs["weight_decay"] = st.number_input(
            "Weight decay", 0.000, None, 0.01, format="%f"
        )

    inputs["gradient_accumulation_steps"] = st.number_input(
        "Gradient Accumulation Steps", 1, None, 8)

    inputs['lr_scheduler_type'] = st.selectbox(
        "The scheduler type to use", ["linear", "cosine", "cosine_with_restarts", "polynomial", "constant", "constant_with_warmup"])
    inputs['num_warmup_steps'] = st.number_input(
        "Num warmup steps", 0, None, 0)
    inputs["batch_size"] = st.number_input("Batch size", 1, None, 32)
    inputs["num_epochs"] = st.number_input("Epochs", 1, None, 3)
    return inputs


def show_datset_view_component(inputs: Dict[str, str]) -> Dict[str, str]:
    data_info_dict = get_dataset_infos_dict(
        inputs["dataset"], inputs["subset"])
    st.write(f'## Dataset view: {inputs["dataset"]}/{inputs["subset"]}')
    st.markdown(
        "*Homepage*: "
        + data_info_dict.homepage
        + "\n\n*Dataset*: https://github.com/huggingface/datasets/blob/master/datasets/%s/%s.py"
        % (inputs["dataset"], inputs["dataset"])
    )
    s = []
    s.append('dataset' + "=" + inputs["dataset"])
    s.append('config' + "=" + inputs["subset"])
    st.markdown(
        "*Permalink*: https://huggingface.co/datasets/viewer/?"
        + "&".join(s)
    )
    # https://github.com/huggingface/datasets-viewer/blob/master/run.py#L282
    st.write(f'{data_info_dict.description}')
    st.write(render_features(data_info_dict.features))
    # TODO make a conditional if the size of the data is too big, switch to streaming mode
    # TODO cache this part of the code
    # selected_dataset = load_dataset(
    #     inputs["dataset"], inputs["subset"], split=inputs["train"], streaming=True)
    # print(selected_dataset)
    # print(next(iter(selected_dataset)))
    return inputs


def show_code_component(inputs: Dict[str, str]) -> Dict[str, str]:
    # Generate code and notebook based on the template.py.jinja file in the template dir.
    env = Environment(
        loader=FileSystemLoader(inputs['template_dir']), trim_blocks=True, lstrip_blocks=True,
    )

    template = env.get_template(f'task_templates/{inputs["nlp_task"]}.py.jinja')
    code = template.render(header=utils.code_header, notebook=False, **inputs)
    notebook_code = template.render(
        header=utils.notebook_header, notebook=True, **inputs)

    notebook = utils.to_notebook(notebook_code)

    st.write(f'## Code view: {inputs["api"]}')
    st.write("")  # add vertical space
    col1, col2 = st.beta_columns(2)
    with col1:
        utils.download_button(code, "generated-code.py", "🐍 Download (.py)")
    with col2:
        utils.download_button(
            notebook, "generated-notebook.ipynb", "📓 Download (.ipynb)")
    colab_error = st.empty()
    # Display code.
    st.code(code)
    return inputs
app/configuration.py
ADDED
@@ -0,0 +1,17 @@
INCLUDED_USERS = ['google', 'EleutherAI',
                  "Helsinki-NLP", "bigscience", "facebook", "openai", "microsoft"]

# TODO create a template for text2text-generation
# TASKS_TO_PIPELINE_TAG = {
#     "CausalLM": ['text-generation'], "MaskedLM": ["fill-mask"], "Seq2SeqLM": ['text2text-generation', 'translation']}
TASKS_TO_PIPELINE_TAG = {
    "CausalLM": ['text-generation'], "MaskedLM": ["fill-mask"], "Seq2SeqLM": ['translation']}


TASKS = list(TASKS_TO_PIPELINE_TAG.keys())

OPTIMIZERS_ACCELERATE = {
    "AdamW": 0.0001, "Adadelta": 1.0, "Adagrad": 0.01, "Adam": 0.001, "SparseAdam": 0.001, "Adamax": 0.002, "ASGD": 0.01, "LBFGS": 1.0, "NAdam": 0.002, "RAdam": 0.001, "RMSprop": 0.01, "Rprop": 0.01, "SGD": 0.01
}

OPTIMIZERS_TRAINER = {'adamw_hf': 0.0001, 'adamw_torch': 0.0001, 'adamw_apex_fused': 0.0001, 'adafactor': 0.0001}
app/main.py
ADDED
@@ -0,0 +1,36 @@
import streamlit as st

from components import (show_API_component, show_code_component,
                        show_datset_view_component, show_input_data_component,
                        show_model_component, show_preprocessing_component,
                        show_task_component, show_training_comoponent)

st.set_page_config(
    page_title="Training Code Generator for Hugging Face Models ", layout="wide"
)

st.markdown("<br>", unsafe_allow_html=True)

"""
# Training Code Generator for Hugging Face Models 🤗
"""
st.markdown("<br>", unsafe_allow_html=True)
"""
---
"""

inputs = {}

with st.sidebar:
    st.info(
        "**Select the configuration**"
    )
    inputs = show_API_component(inputs)
    inputs = show_task_component(inputs)
    inputs = show_model_component(inputs)
    inputs = show_input_data_component(inputs)
    inputs = show_preprocessing_component(inputs)
    inputs = show_training_comoponent(inputs)

inputs = show_datset_view_component(inputs)
inputs = show_code_component(inputs)
app/utils.py
ADDED
@@ -0,0 +1,216 @@
import base64
import importlib.util
import math
import re
import uuid
from types import ModuleType
from typing import Dict

import datasets
import jupytext
import requests
import streamlit as st
from datasets import DatasetInfo, get_dataset_infos
from datasets.info import DatasetInfosDict

from configuration import INCLUDED_USERS, TASKS_TO_PIPELINE_TAG


def import_from_file(module_name: str, filepath: str) -> ModuleType:
    """
    Imports a module from file.
    Args:
        module_name (str): Assigned to the module's __name__ parameter (does not
            influence how the module is named outside of this function)
        filepath (str): Path to the .py file
    Returns:
        The module
    """
    spec = importlib.util.spec_from_file_location(module_name, filepath)
    module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(module)
    return module


def notebook_header(text: str):
    """
    Insert section header into a jinja file, formatted as notebook cell.

    Leave 2 blank lines before the header.
    """
    return f"""# # {text}
"""


def code_header(text: str):
    """
    Insert section header into a jinja file, formatted as Python comment.

    Leave 2 blank lines before the header.
    """
    seperator_len = (75 - len(text)) / 2
    seperator_len_left = math.floor(seperator_len)
    seperator_len_right = math.ceil(seperator_len)
    return f"# {'-' * seperator_len_left} {text} {'-' * seperator_len_right}"


def to_notebook(code: str) -> str:
    """Converts Python code to Jupyter notebook format."""
    notebook = jupytext.reads(code, fmt="py")
    # print(jupytext.writes(notebook, fmt="ipynb"))
    return jupytext.writes(notebook, fmt="ipynb")


def download_button(
    object_to_download: str, download_filename: str, button_text: str  # , pickle_it=False
):
    """
    Generates a link to download the given object_to_download.

    From: https://discuss.streamlit.io/t/a-download-button-with-custom-css/4220
    Params:
    ------
    object_to_download: The object to be downloaded.
    download_filename (str): filename and extension of file. e.g. mydata.csv,
        some_txt_output.txt download_link_text (str): Text to display for download
        link.
    button_text (str): Text to display on download button (e.g. 'click here to download file')
    pickle_it (bool): If True, pickle file.
    Returns:
    -------
    (str): the anchor tag to download object_to_download
    Examples:
    --------
    download_link(your_df, 'YOUR_DF.csv', 'Click to download data!')
    download_link(your_str, 'YOUR_STRING.txt', 'Click to download text!')
    """

    # try:
    #     # some strings <-> bytes conversions necessary here
    b64 = base64.b64encode(object_to_download.encode()).decode()
    # except AttributeError:
    #     b64 = base64.b64encode(object_to_download).decode()

    button_uuid = str(uuid.uuid4()).replace("-", "")
    button_id = re.sub(r"\d+", "", button_uuid)

    custom_css = f"""
        <style>
            #{button_id} {{
                display: inline-flex;
                align-items: center;
                justify-content: center;
                background-color: rgb(255, 255, 255);
                color: rgb(38, 39, 48);
                padding: .25rem .75rem;
                position: relative;
                text-decoration: none;
                border-radius: 4px;
                border-width: 1px;
                border-style: solid;
                border-color: rgb(230, 234, 241);
                border-image: initial;
            }}
            #{button_id}:hover {{
                border-color: rgb(246, 51, 102);
                color: rgb(246, 51, 102);
            }}
            #{button_id}:active {{
                box-shadow: none;
                background-color: rgb(246, 51, 102);
                color: white;
            }}
        </style> """

    dl_link = (
        custom_css
        + f'<a download="{download_filename}" id="{button_id}" href="data:file/txt;base64,{b64}">{button_text}</a><br><br>'
    )

    st.markdown(dl_link, unsafe_allow_html=True)


@st.cache
def get_model_to_model_id() -> Dict[str, Dict[str, str]]:
    requests.get("https://huggingface.co")
    response = requests.get("https://huggingface.co/api/models")
    tags = response.json()
    model_to_model_id = {}
    model_to_pipeline_tag = {}

    for model in tags:
        model_name = model['modelId']
        is_community_model = "/" in model_name
        if is_community_model:
            user = model_name.split("/")[0]
            if user not in INCLUDED_USERS:
                continue

        # TODO Right now, if the pipeline tag is not defined, skip the model
        if "pipeline_tag" in model:
            model_to_model_id[model['id']] = model['modelId']
            model_to_pipeline_tag[model['id']] = model["pipeline_tag"]
    return {"model_to_model_id": model_to_model_id, "model_to_pipeline_tag": model_to_pipeline_tag}


@st.cache
def get_datasets() -> Dict[str, str]:
    english_datasets = {}
    response = requests.get(
        "https://huggingface.co/api/datasets?full=true&languages=en")
    tags = response.json()
    for dataset in tags:
        dataset_name = dataset["id"]

        is_community_dataset = "/" in dataset_name
        if is_community_dataset:
            # user = dataset_name.split("/")[0]
            # if user in INCLUDED_USERS:
            #     english_datasets.append(dataset_name)
            continue

        if "cardData" not in dataset:
            continue
        metadata = dataset["cardData"]

        if "languages" not in metadata:
            continue

        if "task_categories" not in metadata:
            continue

        task_is_valid = False
        for task_category in metadata["task_categories"]:

            if any(task_category in task for task in list(TASKS_TO_PIPELINE_TAG.values())):
                task_is_valid = True
        if not task_is_valid:
            continue

        languages = metadata["languages"]

        if "en" in languages or "en-US" in languages:
            english_datasets[dataset_name] = metadata["task_categories"]
    return english_datasets


@st.cache
def get_dataset_infos_dict(dataset: str, subset: str) -> DatasetInfo:
    return DatasetInfosDict(get_dataset_infos(dataset))[subset]

# https://github.com/huggingface/datasets-viewer/blob/master/run.py#L49


def render_features(features):
    # TODO render the translation object with the language tags
    if isinstance(features, dict):
        return {k: render_features(v) for k, v in features.items()}
    if isinstance(features, datasets.features.ClassLabel):
        return features.names

    if isinstance(features, datasets.features.Value):
        return features.dtype

    if isinstance(features, datasets.features.Sequence):
        return {"[]": render_features(features.feature)}
    return features
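For reference, a minimal usage sketch of render_features (not part of the commit): it flattens a datasets.Features object into plain Python values that Streamlit can display. The toy schema below and the assumption that app/ is the working directory (or on sys.path) are illustrative.

import datasets

from utils import render_features  # assumes app/ is the working directory or on sys.path

features = datasets.Features(
    {
        "text": datasets.Value("string"),
        "label": datasets.ClassLabel(names=["neg", "pos"]),
        "tokens": datasets.Sequence(datasets.Value("string")),
    }
)

print(render_features(features))
# {'text': 'string', 'label': ['neg', 'pos'], 'tokens': {'[]': 'string'}}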
requirements.txt
ADDED
@@ -0,0 +1,4 @@
streamlit
datasets
jupytext
Jinja2
templates/Accelerate/task_templates/fill-mask.py.jinja
ADDED
@@ -0,0 +1,271 @@
# Before running, install required packages:
{% if notebook %}

!
{%- else %}
#
{%- endif %}
 pip install datasets transformers[sentencepiece] accelerate

import collections
import logging
import math

import datasets
import numpy as np
import torch
import transformers
from accelerate import Accelerator
from accelerate.logging import get_logger
from accelerate.utils import set_seed
from codecarbon import EmissionsTracker
from datasets import load_dataset
from torch.optim import {{ optimizer }}
from torch.utils.data import DataLoader
from torch.utils.data.dataloader import DataLoader
from tqdm.auto import tqdm
from transformers import (AutoConfig, AutoModelForMaskedLM, AutoTokenizer,
                          DataCollatorForLanguageModeling, Trainer,
                          TrainingArguments, default_data_collator,
                          get_scheduler)
from transformers.utils.versions import require_version

{{ header("Setup") }}

tracker = EmissionsTracker(log_level='error')
tracker.start()

logger = get_logger(__name__)
require_version("datasets>=1.8.0")

accelerator = Accelerator()
set_seed({{ seed }})

logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
    datefmt="%m/%d/%Y %H:%M:%S",
    level=logging.ERROR,
)
logger.info(accelerator.state, main_process_only=False)
if accelerator.is_local_main_process:
    datasets.utils.logging.set_verbosity_warning()
    transformers.utils.logging.set_verbosity_info()
else:
    datasets.utils.logging.set_verbosity_error()
    transformers.utils.logging.set_verbosity_error()

{{ header("Load model and dataset") }}

{% if subset == 'default' %}
datasets = load_dataset('{{dataset}}')
{% else %}
datasets = load_dataset('{{dataset}}', '{{ subset }}')
{% endif %}
model_checkpoint = "{{model_checkpoint}}"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)
{% if pretrained %}
model = AutoModelFor{{task}}.from_pretrained(model_checkpoint)
{% else %}
config = AutoConfig.from_pretrained(model_checkpoint)
model = AutoModelFor{{task}}.from_config(config)
{% endif %}
model.resize_token_embeddings(len(tokenizer))
model_name = model_checkpoint.split("/")[-1]
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

{{ header("Preprocessing") }}

def tokenize_function(examples):
    result = tokenizer(examples["{{ feature }}"])
{% if task=="MaskedLM" %}
{% if whole_word_masking %}
    if tokenizer.is_fast:
        result["word_ids"] = [result.word_ids(i) for i in range(len(result["input_ids"]))]
{% endif %}
{% endif %}
    return result

with accelerator.main_process_first():
    tokenized_datasets = datasets.map(tokenize_function, batched=True, num_proc=4, remove_columns=list(set(sum(list(datasets.column_names.values()), []))), desc="Running tokenizer on dataset"
    )

block_size = {{ block_size }}

def group_texts(examples):
    # Concatenate all texts.
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the small remainder; we could add padding instead of this drop if the model supported it.
    # You can customize this part to your needs.
    total_length = (total_length // block_size) * block_size
    # Split by chunks of max_len.
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    result["labels"] = result["input_ids"].copy()
    return result

with accelerator.main_process_first():
    lm_datasets = tokenized_datasets.map(
        group_texts,
        batched=True,
        batch_size=1000,
        num_proc=4,
        desc=f"Grouping texts in chunks of {block_size}",
    )

{% if whole_word_masking %}
def whole_word_masking_data_collator(features):
    for feature in features:
        word_ids = feature.pop("word_ids")

        # Create a map between words and corresponding token indices
        mapping = collections.defaultdict(list)
        current_word_index = -1
        current_word = None
        for idx, word_id in enumerate(word_ids):
            if word_id is not None:
                if word_id != current_word:
                    current_word = word_id
                    current_word_index += 1
                mapping[current_word_index].append(idx)

        # Randomly mask words
        wwm_probability = {{ mlm_probability }}
        mask = np.random.binomial(1, wwm_probability, (len(mapping),))
        input_ids = feature["input_ids"]
        labels = feature["labels"]
        new_labels = [-100] * len(labels)
        for word_id in np.where(mask)[0]:
            word_id = word_id.item()
            for idx in mapping[word_id]:
                new_labels[idx] = labels[idx]
                input_ids[idx] = tokenizer.mask_token_id

    return default_data_collator(features)

data_collator = whole_word_masking_data_collator
{% else %}
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability={{ mlm_probability }})
{% endif %}

def insert_random_mask(batch):
    features = [dict(zip(batch, t)) for t in zip(*batch.values())]
    masked_inputs = data_collator(features)
    # Create a new "masked" column for each column in the dataset
    return {"masked_" + k: v.numpy() for k, v in masked_inputs.items()}

{% if whole_word_masking %}
lm_datasetst = lm_datasets.remove_columns(["word_ids"])
{% endif %}
with accelerator.main_process_first():
    eval_dataset = lm_datasets["{{ validation }}"].map(
        insert_random_mask,
        batched=True,
        remove_columns=lm_datasets["{{ validation }}"].column_names,
        desc="Inserting a random mask on eval dataset"
    )

eval_dataset = eval_dataset.rename_columns(
    {
        name: name.split('masked_')[1] for name in eval_dataset.features.keys()
    }
)


batch_size = {{ batch_size }}
train_dataloader = DataLoader(
    lm_datasets["{{ train }}"],
    shuffle=True,
    batch_size=batch_size,
    collate_fn=data_collator,
)
eval_dataloader = DataLoader(
    eval_dataset, batch_size=batch_size, collate_fn=default_data_collator
)

{{ header("Training") }}

{% if use_weight_decay %}
weight_decay = {{ weight_decay }}
def get_grouped_params(model, no_decay=["bias", "LayerNorm.weight"]):
    params_with_wd, params_without_wd = [], []
    for n, p in model.named_parameters():
        if any(nd in n for nd in no_decay):
            params_without_wd.append(p)
        else:
            params_with_wd.append(p)
    return [
        {"params": params_with_wd, "weight_decay": weight_decay},
        {"params": params_without_wd, "weight_decay": 0.0},
    ]

optimizer = {{ optimizer }}(get_grouped_params(model), lr={{ lr }})
{% else %}
optimizer = {{ optimizer }}(model.parameters(), lr={{ lr }})
{% endif %}

accelerator = Accelerator()
model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
    model, optimizer, train_dataloader, eval_dataloader
)

num_train_epochs = {{ num_epochs }}
gradient_accumulation_steps = {{ gradient_accumulation_steps }}
num_update_steps_per_epoch = math.ceil(len(train_dataloader) / gradient_accumulation_steps)
max_train_steps = num_train_epochs * num_update_steps_per_epoch
output_dir = f"{model_name}-finetuned"

lr_scheduler = get_scheduler(
    '{{ lr_scheduler_type }}',
    optimizer=optimizer,
    num_warmup_steps={{ num_warmup_steps }},
    num_training_steps=max_train_steps,
)

progress_bar = tqdm(range(max_train_steps), disable=not accelerator.is_local_main_process)
for epoch in range(num_train_epochs):
    # Training
    model.train()
    for step, batch in enumerate(train_dataloader):
        outputs = model(**batch)
        loss = outputs.loss / gradient_accumulation_steps
        accelerator.backward(loss)

        if step % gradient_accumulation_steps == 0 or step == len(train_dataloader) - 1:
            # TODO Let the user decide on clip grad norm
            accelerator.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()
            progress_bar.update(1)

    # Evaluation
    model.eval()
    losses = []
    for step, batch in enumerate(eval_dataloader):
        with torch.no_grad():
            outputs = model(**batch)

        loss = outputs.loss
        losses.append(accelerator.gather(loss.repeat(batch_size)))

    losses = torch.cat(losses)
    losses = losses[: len(eval_dataset)]
    try:
        eval_loss = torch.mean(losses)
        perplexity = math.exp(eval_loss)
    except OverflowError:
        perplexity = float("inf")
    accelerator.print({"loss/eval": eval_loss, "perplexity": perplexity})
    model.train()
accelerator.wait_for_everyone()
unwrapped_model = accelerator.unwrap_model(model)
unwrapped_model.save_pretrained(output_dir, save_function=accelerator.save)
if accelerator.is_main_process:
    tokenizer.save_pretrained(output_dir)

emissions = tracker.stop()
accelerator.print(f'Emissions: {emissions} kg')
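For context, a minimal sketch (not part of the commit) of how a template like the one above is rendered outside the Streamlit UI; it mirrors show_code_component in app/components.py. The dataset, model, and hyperparameter values in the inputs dict are illustrative assumptions, and the script is assumed to run from the repository root.

import sys

sys.path.append("app")  # so that utils (app/utils.py) and configuration can be imported
import utils
from jinja2 import Environment, FileSystemLoader

# Example values for the keys collected by the sidebar components (illustrative only).
inputs = {
    "task": "MaskedLM",
    "nlp_task": "fill-mask",
    "dataset": "imdb",
    "subset": "plain_text",
    "train": "train",
    "validation": "test",
    "feature": "text",
    "model_checkpoint": "distilroberta-base",
    "pretrained": True,
    "block_size": 128,
    "mlm_probability": 0.15,
    "whole_word_masking": False,
    "seed": 4,
    "optimizer": "AdamW",
    "lr": 0.0001,
    "use_weight_decay": False,
    "gradient_accumulation_steps": 8,
    "lr_scheduler_type": "linear",
    "num_warmup_steps": 0,
    "batch_size": 32,
    "num_epochs": 3,
}

env = Environment(
    loader=FileSystemLoader("templates/Accelerate"), trim_blocks=True, lstrip_blocks=True
)
template = env.get_template("task_templates/fill-mask.py.jinja")
code = template.render(header=utils.code_header, notebook=False, **inputs)
print(code)  # the generated training script, as shown in the app's code view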
templates/Accelerate/task_templates/text-generation.py.jinja
ADDED
@@ -0,0 +1,207 @@
# Before running, install required packages:
{% if notebook %}

!
{%- else %}
#
{%- endif %}
 pip install datasets transformers[sentencepiece] accelerate sacrebleu==1.4.14 codecarbon sacremoses

import collections
import logging
import math
import random

import datasets
import numpy as np
import torch
import transformers
from accelerate import Accelerator
from accelerate.logging import get_logger
from accelerate.utils import set_seed
from codecarbon import EmissionsTracker
from datasets import load_dataset
from torch.optim import {{ optimizer }}
from torch.utils.data import DataLoader
from torch.utils.data.dataloader import DataLoader
from tqdm.auto import tqdm
from transformers import (AutoConfig, AutoModelForCausalLM, AutoModelForMaskedLM, AutoTokenizer,
                          DataCollatorForLanguageModeling, Trainer,
                          TrainingArguments, default_data_collator,
                          get_scheduler)
from transformers.utils.versions import require_version

{{ header("Setup") }}

tracker = EmissionsTracker(log_level='error')
tracker.start()

logger = get_logger(__name__)
require_version("datasets>=1.8.0")

accelerator = Accelerator()
set_seed({{ seed }})

logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
    datefmt="%m/%d/%Y %H:%M:%S",
    level=logging.ERROR,
)
logger.info(accelerator.state, main_process_only=False)
if accelerator.is_local_main_process:
    datasets.utils.logging.set_verbosity_warning()
    transformers.utils.logging.set_verbosity_info()
else:
    datasets.utils.logging.set_verbosity_error()
    transformers.utils.logging.set_verbosity_error()

{{ header("Load model and dataset") }}

{% if subset == 'default' %}
datasets = load_dataset('{{dataset}}')
{% else %}
datasets = load_dataset('{{dataset}}', '{{ subset }}')
{% endif %}
model_checkpoint = "{{model_checkpoint}}"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)
{% if pretrained %}
model = AutoModelFor{{task}}.from_pretrained(model_checkpoint)
{% else %}
config = AutoConfig.from_pretrained(model_checkpoint)
model = AutoModelFor{{task}}.from_config(config)
{% endif %}
model.resize_token_embeddings(len(tokenizer))
model_name = model_checkpoint.split("/")[-1]

if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

{{ header("Preprocessing") }}

def tokenize_function(examples):
    result = tokenizer(examples["{{ feature }}"])
{% if task=="MaskedLM" %}
{% if whole_word_masking %}
    if tokenizer.is_fast:
        result["word_ids"] = [result.word_ids(i) for i in range(len(result["input_ids"]))]
{% endif %}
{% endif %}
    return result

with accelerator.main_process_first():
    tokenized_datasets = datasets.map(tokenize_function, batched=True, num_proc=4, remove_columns=list(set(sum(list(datasets.column_names.values()), []))), desc="Running tokenizer on dataset"
    )

block_size = {{ block_size }}

def group_texts(examples):
    # Concatenate all texts.
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the small remainder; we could add padding instead of this drop if the model supported it.
    # You can customize this part to your needs.
    total_length = (total_length // block_size) * block_size
    # Split by chunks of max_len.
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    result["labels"] = result["input_ids"].copy()
    return result

with accelerator.main_process_first():
    lm_datasets = tokenized_datasets.map(
        group_texts,
        batched=True,
        batch_size=1000,
        num_proc=4,
        desc=f"Grouping texts in chunks of {block_size}",
    )

data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)
batch_size = {{ batch_size }}
train_dataloader = DataLoader(lm_datasets["{{ train }}"], batch_size=batch_size, shuffle=True, collate_fn=data_collator)
eval_dataloader = DataLoader(lm_datasets["{{ validation }}"], batch_size=batch_size, collate_fn=data_collator)

{{ header("Training") }}

{% if use_weight_decay %}
weight_decay = {{ weight_decay }}
def get_grouped_params(model, no_decay=["bias", "LayerNorm.weight"]):
    params_with_wd, params_without_wd = [], []
    for n, p in model.named_parameters():
        if any(nd in n for nd in no_decay):
            params_without_wd.append(p)
        else:
            params_with_wd.append(p)
    return [
        {"params": params_with_wd, "weight_decay": weight_decay},
        {"params": params_without_wd, "weight_decay": 0.0},
    ]

optimizer = {{ optimizer }}(get_grouped_params(model), lr={{ lr }})
{% else %}
optimizer = {{ optimizer }}(model.parameters(), lr={{ lr }})
{% endif %}

model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
    model, optimizer, train_dataloader, eval_dataloader
)

num_train_epochs = {{ num_epochs }}
gradient_accumulation_steps = {{ gradient_accumulation_steps }}
num_update_steps_per_epoch = math.ceil(len(train_dataloader) / gradient_accumulation_steps)
max_train_steps = num_train_epochs * num_update_steps_per_epoch
output_dir = f"{model_name}-finetuned"

lr_scheduler = get_scheduler(
    '{{ lr_scheduler_type }}',
    optimizer=optimizer,
    num_warmup_steps={{ num_warmup_steps }},
    num_training_steps=max_train_steps,
)

progress_bar = tqdm(range(max_train_steps), disable=not accelerator.is_local_main_process)
for epoch in range(num_train_epochs):
    # Training
    model.train()
    for step, batch in enumerate(train_dataloader):
        outputs = model(**batch)
        loss = outputs.loss / gradient_accumulation_steps
        accelerator.backward(loss)
        if step % gradient_accumulation_steps == 0 or step == len(train_dataloader) - 1:
            # TODO Let the user decide on clip grad norm
            accelerator.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()
            progress_bar.update(1)

    # Evaluation
    model.eval()
    losses = []
    for step, batch in enumerate(eval_dataloader):
        with torch.no_grad():
            outputs = model(**batch)

        loss = outputs.loss
        losses.append(accelerator.gather(loss.repeat(batch_size)))

    losses = torch.cat(losses)
    losses = losses[: len(eval_dataloader.dataset)]
    try:
        eval_loss = torch.mean(losses)
        perplexity = math.exp(eval_loss)
    except OverflowError:
        perplexity = float("inf")

    accelerator.print(f"epoch {epoch}: perplexity: {perplexity} eval_loss: {eval_loss}")
    model.train()
accelerator.wait_for_everyone()
unwrapped_model = accelerator.unwrap_model(model)
unwrapped_model.save_pretrained(output_dir, save_function=accelerator.save)
if accelerator.is_main_process:
    tokenizer.save_pretrained(output_dir)

emissions = tracker.stop()
accelerator.print(f'Emissions: {emissions} kg')
templates/Accelerate/task_templates/translation.py.jinja
ADDED
@@ -0,0 +1,287 @@
1 |
+
# Before running, install required packages:
|
2 |
+
{% if notebook %}
|
3 |
+
|
4 |
+
!
|
5 |
+
{%- else %}
|
6 |
+
#
|
7 |
+
{%- endif %}
|
8 |
+
pip install datasets transformers[sentencepiece] accelerate sacrebleu==1.4.14 codecarbon sacremoses
|
9 |
+
|
10 |
+
import collections
|
11 |
+
import logging
|
12 |
+
import math
|
13 |
+
import random
|
14 |
+
|
15 |
+
import babel
|
16 |
+
import datasets
|
17 |
+
import numpy as np
|
18 |
+
import torch
|
19 |
+
import transformers
|
20 |
+
from accelerate import Accelerator
|
21 |
+
from accelerate.logging import get_logger
|
22 |
+
from accelerate.utils import set_seed
|
23 |
+
from codecarbon import EmissionsTracker
|
24 |
+
from datasets import load_dataset, load_metric
|
25 |
+
from torch.optim import {{ optimizer }}
|
26 |
+
from torch.utils.data import DataLoader
|
27 |
+
from torch.utils.data.dataloader import DataLoader
|
28 |
+
from tqdm.auto import tqdm
|
29 |
+
from transformers import (AutoConfig, AutoModelForSeq2SeqLM, AutoTokenizer,
|
30 |
+
DataCollatorForLanguageModeling,
|
31 |
+
DataCollatorForSeq2Seq, MBartTokenizer,
|
32 |
+
MBartTokenizerFast, Trainer, TrainingArguments,
|
33 |
+
default_data_collator, get_scheduler)
|
34 |
+
from transformers.utils.versions import require_version
|
35 |
+
|
36 |
+
{{ header("Setup") }}
|
37 |
+
|
38 |
+
tracker = EmissionsTracker(log_level='error')
|
39 |
+
tracker.start()
|
40 |
+
|
41 |
+
logger = get_logger(__name__)
|
42 |
+
require_version("datasets>=1.8.0")
|
43 |
+
|
44 |
+
accelerator = Accelerator()
|
45 |
+
set_seed({{ seed }})
|
46 |
+
|
47 |
+
logging.basicConfig(
|
48 |
+
format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
|
49 |
+
datefmt="%m/%d/%Y %H:%M:%S",
|
50 |
+
level=logging.ERROR,
|
51 |
+
)
|
52 |
+
logger.info(accelerator.state, main_process_only=False)
|
53 |
+
if accelerator.is_local_main_process:
|
54 |
+
datasets.utils.logging.set_verbosity_warning()
|
55 |
+
transformers.utils.logging.set_verbosity_info()
|
56 |
+
else:
|
57 |
+
datasets.utils.logging.set_verbosity_error()
|
58 |
+
transformers.utils.logging.set_verbosity_error()
|
59 |
+
|
60 |
+
{{ header("Load model and dataset") }}
|
61 |
+
|
62 |
+
{% if subset == 'default' %}
|
63 |
+
datasets = load_dataset('{{dataset}}')
|
64 |
+
{% else %}
|
65 |
+
datasets = load_dataset('{{dataset}}', '{{ subset }}')
|
66 |
+
{% endif %}
|
67 |
+
metric = load_metric("sacrebleu")
|
68 |
+
model_checkpoint = "{{model_checkpoint}}"
|
69 |
+
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)
|
70 |
+
{% if pretrained %}
|
71 |
+
model = AutoModelFor{{task}}.from_pretrained(model_checkpoint)
|
72 |
+
{% else %}
|
73 |
+
config = AutoConfig.from_pretrained(model_checkpoint)
|
74 |
+
model = AutoModelFor{{task}}.from_config(config)
|
75 |
+
{% endif %}
|
76 |
+
model.resize_token_embeddings(len(tokenizer))
|
77 |
+
model_name = model_checkpoint.split("/")[-1]
|
78 |
+
|
79 |
+
{{ header("Preprocessing") }}
|
80 |
+
|
81 |
+
source_lang = '{{ source_language }}'
|
82 |
+
target_lang = '{{ target_language }}'
|
83 |
+
{% if 'mbart' in model_checkpoint %}
|
84 |
+
|
85 |
+
# Set decoder_start_token_id
|
86 |
+
if model.config.decoder_start_token_id is None and isinstance(tokenizer, (MBartTokenizer, MBartTokenizerFast)):
|
87 |
+
assert (
|
88 |
+
target_lang is not None and source_lang is not None
|
89 |
+
), "mBart requires --target_lang and --source_lang"
|
90 |
+
if isinstance(tokenizer, MBartTokenizer):
|
91 |
+
model.config.decoder_start_token_id = tokenizer.lang_code_to_id[target_lang]
|
92 |
+
else:
|
93 |
+
model.config.decoder_start_token_id = tokenizer.convert_tokens_to_ids(target_lang)
|
94 |
+
|
95 |
+
{% endif %}
|
96 |
+
{% if 't5' in model_checkpoint %}
|
97 |
+
if model_checkpoint in ["t5-small", "t5-base", "t5-larg", "t5-3b", "t5-11b"]:
|
98 |
+
for language in (source_lang, target_lang):
|
99 |
+
if language != language[:2]:
|
100 |
+
logging.warning(
|
101 |
+
'Extended language code %s not supported. Falling back on %s.',
|
102 |
+
language, language[:2]
|
103 |
+
)
|
104 |
+
lang_id_to_string = {
|
105 |
+
source_lang: babel.Locale(source_lang[:2]).english_name,
|
106 |
+
target_lang: babel.Locale(target_lang[:2]).english_name,
|
107 |
+
}
|
108 |
+
src_str = 'translate {}'.format(lang_id_to_string[source_lang])
|
109 |
+
tgt_str = ' to {}: '.format(lang_id_to_string[target_lang])
|
110 |
+
prefix = src_str + tgt_str
|
111 |
+
else:
|
112 |
+
prefix = ""
|
113 |
+
{% else %}
|
114 |
+
prefix = ""
|
115 |
+
{% endif %}
|
116 |
+
{% if 'mbart' in model_checkpoint %}
|
117 |
+
|
118 |
+
# For translation we set the codes of our source and target languages (only useful for mBART, the others will
|
119 |
+
# ignore those attributes).
|
120 |
+
if isinstance(tokenizer, (MBartTokenizer, MBartTokenizerFast)):
|
121 |
+
label = ['ar_AR', 'cs_CZ', 'de_DE', 'en_XX', 'es_XX', 'et_EE', 'fi_FI', 'fr_XX', 'gu_IN', 'hi_IN', 'it_IT', 'ja_XX', 'kk_KZ', 'ko_KR', 'lt_LT', 'lv_LV', 'my_MM', 'ne_NP', 'nl_XX', 'ro_RO', 'ru_RU', 'si_LK', 'tr_TR', 'vi_VN', 'zh_CN']
|
122 |
+
source_code = [item for item in label if item.startswith(source_lang)][0]
|
123 |
+
target_code = [item for item in label if item.startswith(target_lang)][0]
|
124 |
+
if source_lang is not None:
|
125 |
+
tokenizer.src_lang = source_code
|
126 |
+
if target_lang is not None:
|
127 |
+
tokenizer.tgt_lang = target_code
|
128 |
+
{% endif %}
|
129 |
+
max_input_length = {{ block_size }}
|
130 |
+
max_target_length = {{ block_size }}
|
131 |
+
|
132 |
+
def preprocess_function(examples):
|
133 |
+
inputs = [prefix + ex[source_lang] for ex in examples["translation"]]
|
134 |
+
targets = [ex[target_lang] for ex in examples["translation"]]
|
135 |
+
model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)
|
136 |
+
|
137 |
+
# Setup the tokenizer for targets
|
138 |
+
with tokenizer.as_target_tokenizer():
|
139 |
+
labels = tokenizer(targets, max_length=max_target_length, truncation=True)
|
140 |
+
|
141 |
+
model_inputs["labels"] = labels["input_ids"]
|
142 |
+
return model_inputs
|
143 |
+
|
144 |
+
with accelerator.main_process_first():
|
145 |
+
tokenized_datasets = datasets.map(preprocess_function, batched=True, num_proc=4, remove_columns=list(
|
146 |
+
set(sum(list(datasets.column_names.values()), []))), desc="Running tokenizer on dataset")
|
147 |
+
|
148 |
+
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, pad_to_multiple_of=8 if accelerator.use_fp16 else None)
|
149 |
+
batch_size = {{ batch_size }}
|
150 |
+
train_dataloader = DataLoader(tokenized_datasets["{{ train }}"], batch_size=batch_size, shuffle=True, collate_fn=data_collator)
|
151 |
+
eval_dataloader = DataLoader(tokenized_datasets["{{ validation }}"], batch_size=batch_size, collate_fn=data_collator)
|
152 |
+
|
153 |
+
{{ header("Training") }}
|
154 |
+
|
155 |
+
def compute_metrics(eval_preds):
|
156 |
+
preds, labels = eval_preds
|
157 |
+
# In case the model returns more than the prediction logits
|
158 |
+
if isinstance(preds, tuple):
|
159 |
+
preds = preds[0]
|
160 |
+
|
161 |
+
decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
|
162 |
+
|
163 |
+
# Replace -100s in the labels as we can't decode them
|
164 |
+
labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
|
165 |
+
decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
|
166 |
+
|
167 |
+
# Some simple post-processing
|
168 |
+
decoded_preds = [pred.strip() for pred in decoded_preds]
|
169 |
+
decoded_labels = [[label.strip()] for label in decoded_labels]
|
170 |
+
|
171 |
+
result = metric.compute(predictions=decoded_preds,
|
172 |
+
references=decoded_labels)
|
173 |
+
return {"bleu": result["score"]}
|
174 |
+
|
175 |
+
|
176 |
+
def postprocess(predictions, labels):
|
177 |
+
predictions = predictions.cpu().numpy()
|
178 |
+
labels = labels.cpu().numpy()
|
179 |
+
|
180 |
+
decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
|
181 |
+
|
182 |
+
# Replace -100 in the labels as we can't decode them.
|
183 |
+
labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
|
184 |
+
decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
|
185 |
+
|
186 |
+
# Some simple post-processing
|
187 |
+
decoded_preds = [pred.strip() for pred in decoded_preds]
|
188 |
+
decoded_labels = [[label.strip()] for label in decoded_labels]
|
189 |
+
return decoded_preds, decoded_labels
|
190 |
+
|
191 |
+
{% if use_weight_decay %}
|
192 |
+
weight_decay = {{ weight_decay }}
|
193 |
+
def get_grouped_params(model, no_decay=["bias", "LayerNorm.weight"]):
    params_with_wd, params_without_wd = [], []
    for n, p in model.named_parameters():
        if any(nd in n for nd in no_decay):
            params_without_wd.append(p)
        else:
            params_with_wd.append(p)
    return [
        {"params": params_with_wd, "weight_decay": weight_decay},
        {"params": params_without_wd, "weight_decay": 0.0},
    ]

optimizer = {{ optimizer }}(get_grouped_params(model), lr={{ lr }})
{% else %}
optimizer = {{ optimizer }}(model.parameters(), lr={{ lr }})
{% endif %}
model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
    model, optimizer, train_dataloader, eval_dataloader
)

num_train_epochs = {{ num_epochs }}
gradient_accumulation_steps = {{ gradient_accumulation_steps }}
num_update_steps_per_epoch = math.ceil(len(train_dataloader) / gradient_accumulation_steps)
max_train_steps = num_train_epochs * num_update_steps_per_epoch
output_dir = f"{model_name}-finetuned"

lr_scheduler = get_scheduler(
    '{{ lr_scheduler_type }}',
    optimizer=optimizer,
    num_warmup_steps={{ num_warmup_steps }},
    num_training_steps=max_train_steps,
)

progress_bar = tqdm(range(max_train_steps), disable=not accelerator.is_local_main_process)
for epoch in range(num_train_epochs):
    # Training
    model.train()
    for step, batch in enumerate(train_dataloader):
        outputs = model(**batch)
        loss = outputs.loss
        loss = loss / gradient_accumulation_steps
        accelerator.backward(loss)

        if step % gradient_accumulation_steps == 0 or step == len(train_dataloader) - 1:
            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()
            progress_bar.update(1)

    # Evaluation
    model.eval()
    samples_seen = 0
    for step, batch in enumerate(eval_dataloader):
        with torch.no_grad():
            generated_tokens = accelerator.unwrap_model(model).generate(
                batch["input_ids"],
                attention_mask=batch["attention_mask"],
                max_length=128,
            )
        labels = batch["labels"]

        # Necessary to pad predictions and labels for being gathered
        generated_tokens = accelerator.pad_across_processes(
            generated_tokens, dim=1, pad_index=tokenizer.pad_token_id
        )
        labels = accelerator.pad_across_processes(labels, dim=1, pad_index=-100)

        predictions_gathered = accelerator.gather(generated_tokens)
        labels_gathered = accelerator.gather(labels)

        decoded_preds, decoded_labels = postprocess(predictions_gathered, labels_gathered)

        if accelerator.num_processes > 1:
            if step == len(eval_dataloader) - 1:
                decoded_preds = decoded_preds[: len(eval_dataloader.dataset) - samples_seen]
                decoded_labels = decoded_labels[: len(eval_dataloader.dataset) - samples_seen]
            else:
                samples_seen += len(decoded_labels)
        metric.add_batch(predictions=decoded_preds, references=decoded_labels)

    results = metric.compute()
    print(f"epoch {epoch}, BLEU score: {results['score']:.2f}")

# Save and upload
accelerator.wait_for_everyone()
unwrapped_model = accelerator.unwrap_model(model)
unwrapped_model.save_pretrained(output_dir, save_function=accelerator.save)
if accelerator.is_main_process:
    tokenizer.save_pretrained(output_dir)

emissions = tracker.stop()
print(f'Emissions: {emissions} kg')
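The Accelerate script above ends by saving the fine-tuned model and tokenizer into output_dir. As a rough sanity check (not part of the generated template), a checkpoint written this way could be reloaded for inference along these lines; the directory name and the example sentence are purely illustrative:

# Hedged sketch: reload a checkpoint produced by the script above (path and prompt are illustrative).
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

ckpt_dir = "opus-mt-en-fr-finetuned"  # whatever f"{model_name}-finetuned" resolved to
tokenizer = AutoTokenizer.from_pretrained(ckpt_dir)
model = AutoModelForSeq2SeqLM.from_pretrained(ckpt_dir)

inputs = tokenizer("This is a test sentence.", return_tensors="pt")
generated = model.generate(**inputs, max_length=128)
print(tokenizer.batch_decode(generated, skip_special_tokens=True))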
templates/Trainer/task_templates/fill-mask.py.jinja
ADDED
@@ -0,0 +1,185 @@
# Before running, install required packages:
{% if notebook %}

!
{%- else %}
#
{%- endif %}
pip install datasets transformers

import collections
import math
import logging

import numpy as np
import transformers
import datasets
from datasets import load_dataset
from transformers import (AutoConfig, AutoModelForMaskedLM, AutoTokenizer,
                          DataCollatorForLanguageModeling, Trainer,
                          TrainingArguments, default_data_collator, set_seed)
from transformers.testing_utils import CaptureLogger
from transformers.utils.versions import require_version

{{ header("Setup") }}


logger = logging.getLogger(__name__)
require_version("datasets>=1.8.0")
set_seed({{ seed }})
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
    datefmt="%m/%d/%Y %H:%M:%S",
    level=logging.ERROR,
)
datasets.utils.logging.set_verbosity_warning()
transformers.utils.logging.set_verbosity_info()


{{ header("Load model and dataset") }}

{% if subset == 'default' %}
datasets = load_dataset('{{ dataset }}')
{% else %}
datasets = load_dataset('{{ dataset }}', '{{ subset }}')
{% endif %}
model_checkpoint = "{{ model_checkpoint }}"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)
{% if pretrained %}
model = AutoModelFor{{ task }}.from_pretrained(model_checkpoint)
{% else %}
config = AutoConfig.from_pretrained(model_checkpoint)
model = AutoModelFor{{ task }}.from_config(config)
{% endif %}
model.resize_token_embeddings(len(tokenizer))
model_name = model_checkpoint.split("/")[-1]

if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

{{ header("Preprocessing") }}

# Since this will be pickled (to avoid a _LazyModule error in Hasher), force logger loading before tokenize_function
tok_logger = transformers.utils.logging.get_logger("transformers.tokenization_utils_base")

def tokenize_function(examples):
    with CaptureLogger(tok_logger) as cl:
        result = tokenizer(examples["{{ feature }}"])
    if "Token indices sequence length is longer than the" in cl.out:
        tok_logger.warning(
            "^^^^^^^^^^^^^^^^ Please ignore the warning above - this long input will be chunked into smaller bits"
            " before being passed to the model."
        )
    if tokenizer.is_fast:
        result["word_ids"] = [result.word_ids(i) for i in range(len(result["input_ids"]))]
    return result

tokenized_datasets = datasets.map(
    tokenize_function,
    batched=True,
    num_proc=4,
    remove_columns=list(set(sum(list(datasets.column_names.values()), []))),
    desc="Running tokenizer on dataset",
)
block_size = {{ block_size }}

def group_texts(examples):
    # Concatenate all texts.
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the small remainder; we could add padding instead if the model supported it.
    # You can customize this part to your needs.
    total_length = (total_length // block_size) * block_size
    # Split by chunks of max_len.
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    result["labels"] = result["input_ids"].copy()
    return result

lm_datasets = tokenized_datasets.map(
    group_texts,
    batched=True,
    batch_size=1000,
    num_proc=4,
    desc=f"Grouping texts in chunks of {block_size}",
)

{{ header("Training") }}

training_args = TrainingArguments(
    output_dir=f"{model_name}-finetuned",
    per_device_train_batch_size={{ batch_size }},
    per_device_eval_batch_size={{ batch_size }},
    evaluation_strategy='epoch',
    logging_strategy='epoch',
    save_strategy='epoch',
    optim='{{ optimizer }}',
    learning_rate={{ lr }},
    num_train_epochs={{ num_epochs }},
    gradient_accumulation_steps={{ gradient_accumulation_steps }},
    lr_scheduler_type='{{ lr_scheduler_type }}',
    warmup_steps={{ num_warmup_steps }},
    {% if use_weight_decay %}
    weight_decay={{ weight_decay }},
    {% endif %}
    push_to_hub=False,
    dataloader_num_workers=0,
    {% if task == "MaskedLM" %}
    {% if whole_word_masking %}
    remove_unused_columns=False,
    {% endif %}
    {% endif %}
    load_best_model_at_end=True,
    log_level='error',
)


{% if whole_word_masking %}
def whole_word_masking_data_collator(features):
    for feature in features:
        word_ids = feature.pop("word_ids")

        # Create a map between words and corresponding token indices
        mapping = collections.defaultdict(list)
        current_word_index = -1
        current_word = None
        for idx, word_id in enumerate(word_ids):
            if word_id is not None:
                if word_id != current_word:
                    current_word = word_id
                    current_word_index += 1
                mapping[current_word_index].append(idx)

        # Randomly mask whole words
        wwm_probability = {{ mlm_probability }}
        mask = np.random.binomial(1, wwm_probability, (len(mapping),))
        input_ids = feature["input_ids"]
        labels = feature["labels"]
        new_labels = [-100] * len(labels)
        for word_id in np.where(mask)[0]:
            word_id = word_id.item()
            for idx in mapping[word_id]:
                new_labels[idx] = labels[idx]
                input_ids[idx] = tokenizer.mask_token_id
        # Only the masked positions contribute to the loss
        feature["labels"] = new_labels

    return default_data_collator(features)

data_collator = whole_word_masking_data_collator
{% else %}
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability={{ mlm_probability }})
{% endif %}


trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=lm_datasets["{{ train }}"],
    eval_dataset=lm_datasets["{{ validation }}"],
    data_collator=data_collator,
)

train_result = trainer.train()
trainer.save_model()
trainer.log_metrics("train", train_result.metrics)
trainer.save_metrics("train", train_result.metrics)
trainer.save_state()
eval_results = trainer.evaluate()
eval_results["perplexity"] = math.exp(eval_results['eval_loss'])
print(f"Perplexity: {math.exp(eval_results['eval_loss']):.2f}")
trainer.log_metrics("eval", eval_results)
trainer.save_metrics("eval", eval_results)
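Once the fill-mask Trainer run above finishes, the checkpoint saved under f"{model_name}-finetuned" can be smoke-tested with the fill-mask pipeline. This is only an illustrative sketch, not part of the template; the path is hypothetical and the mask token depends on the tokenizer ([MASK] for BERT-style models, <mask> for RoBERTa-style ones):

# Hedged sketch: query the fine-tuned masked-LM checkpoint (path and prompt are illustrative).
from transformers import pipeline

unmasker = pipeline("fill-mask", model="bert-base-uncased-finetuned")
for prediction in unmasker("The goal of language modeling is to predict the [MASK] word."):
    print(prediction["token_str"], round(prediction["score"], 3))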
templates/Trainer/task_templates/text-generation.py.jinja
ADDED
@@ -0,0 +1,152 @@
# Before running, install required packages:
{% if notebook %}

!
{%- else %}
#
{%- endif %}
pip install datasets transformers

import collections
import math
import logging

import numpy as np
import transformers
import datasets
from datasets import load_dataset
from transformers import (AutoConfig, AutoModelForCausalLM, AutoTokenizer,
                          DataCollatorForLanguageModeling, Trainer,
                          TrainingArguments, default_data_collator, set_seed)
from transformers.testing_utils import CaptureLogger
from transformers.utils.versions import require_version

{{ header("Setup") }}


logger = logging.getLogger(__name__)
require_version("datasets>=1.8.0")
set_seed({{ seed }})
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
    datefmt="%m/%d/%Y %H:%M:%S",
    level=logging.ERROR,
)
datasets.utils.logging.set_verbosity_warning()
transformers.utils.logging.set_verbosity_info()


{{ header("Load model and dataset") }}

{% if subset == 'default' %}
datasets = load_dataset('{{ dataset }}')
{% else %}
datasets = load_dataset('{{ dataset }}', '{{ subset }}')
{% endif %}
model_checkpoint = "{{ model_checkpoint }}"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)
{% if pretrained %}
model = AutoModelFor{{ task }}.from_pretrained(model_checkpoint)
{% else %}
config = AutoConfig.from_pretrained(model_checkpoint)
model = AutoModelFor{{ task }}.from_config(config)
{% endif %}
model.resize_token_embeddings(len(tokenizer))
model_name = model_checkpoint.split("/")[-1]

if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

{{ header("Preprocessing") }}

# Since this will be pickled (to avoid a _LazyModule error in Hasher), force logger loading before tokenize_function
tok_logger = transformers.utils.logging.get_logger("transformers.tokenization_utils_base")

def tokenize_function(examples):
    with CaptureLogger(tok_logger) as cl:
        result = tokenizer(examples["{{ feature }}"])
    if "Token indices sequence length is longer than the" in cl.out:
        tok_logger.warning(
            "^^^^^^^^^^^^^^^^ Please ignore the warning above - this long input will be chunked into smaller bits"
            " before being passed to the model."
        )
    if tokenizer.is_fast:
        result["word_ids"] = [result.word_ids(i) for i in range(len(result["input_ids"]))]
    return result

tokenized_datasets = datasets.map(
    tokenize_function,
    batched=True,
    num_proc=4,
    remove_columns=list(set(sum(list(datasets.column_names.values()), []))),
    desc="Running tokenizer on dataset",
)
block_size = {{ block_size }}

def group_texts(examples):
    # Concatenate all texts.
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the small remainder; we could add padding instead if the model supported it.
    # You can customize this part to your needs.
    total_length = (total_length // block_size) * block_size
    # Split by chunks of max_len.
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    result["labels"] = result["input_ids"].copy()
    return result

lm_datasets = tokenized_datasets.map(
    group_texts,
    batched=True,
    batch_size=1000,
    num_proc=4,
    desc=f"Grouping texts in chunks of {block_size}",
)

{{ header("Training") }}

training_args = TrainingArguments(
    output_dir=f"{model_name}-finetuned",
    per_device_train_batch_size={{ batch_size }},
    per_device_eval_batch_size={{ batch_size }},
    evaluation_strategy='epoch',
    logging_strategy='epoch',
    save_strategy='epoch',
    optim='{{ optimizer }}',
    learning_rate={{ lr }},
    num_train_epochs={{ num_epochs }},
    gradient_accumulation_steps={{ gradient_accumulation_steps }},
    lr_scheduler_type='{{ lr_scheduler_type }}',
    warmup_steps={{ num_warmup_steps }},
    {% if use_weight_decay %}
    weight_decay={{ weight_decay }},
    {% endif %}
    push_to_hub=False,
    dataloader_num_workers=0,
    {% if task == "MaskedLM" %}
    {% if whole_word_masking %}
    remove_unused_columns=False,
    {% endif %}
    {% endif %}
    load_best_model_at_end=True,
    log_level='error',
)

data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)


trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=lm_datasets["{{ train }}"],
    eval_dataset=lm_datasets["{{ validation }}"],
    data_collator=data_collator,
)

train_result = trainer.train()
trainer.save_model()
trainer.log_metrics("train", train_result.metrics)
trainer.save_metrics("train", train_result.metrics)
trainer.save_state()
eval_results = trainer.evaluate()
eval_results["perplexity"] = math.exp(eval_results['eval_loss'])
print(f"Perplexity: {math.exp(eval_results['eval_loss']):.2f}")
trainer.log_metrics("eval", eval_results)
trainer.save_metrics("eval", eval_results)
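Similarly, the causal-LM checkpoint produced by the text-generation template above can be sampled from with the text-generation pipeline; the checkpoint name and prompt below are illustrative only:

# Hedged sketch: sample from the fine-tuned causal-LM checkpoint (path and prompt are illustrative).
from transformers import pipeline

generator = pipeline("text-generation", model="gpt2-finetuned")
print(generator("Once upon a time", max_length=50, num_return_sequences=1)[0]["generated_text"])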
templates/Trainer/task_templates/translation.py.jinja
ADDED
@@ -0,0 +1,218 @@
# Before running, install required packages:
{% if notebook %}

!
{%- else %}
#
{%- endif %}
pip install datasets transformers[sentencepiece] accelerate sacrebleu==1.4.14 sacremoses

import collections
import logging
import math
import random

import babel
import datasets
import numpy as np
import torch
import transformers
from datasets import load_dataset, load_metric
from torch.utils.data import DataLoader
from tqdm.auto import tqdm
from transformers import (AutoConfig, AutoModelForSeq2SeqLM, AutoTokenizer,
                          DataCollatorForLanguageModeling,
                          DataCollatorForSeq2Seq, MBartTokenizer,
                          MBartTokenizerFast, Seq2SeqTrainer,
                          Seq2SeqTrainingArguments, default_data_collator,
                          get_scheduler, set_seed)
from transformers.utils.versions import require_version

{{ header("Setup") }}


logger = logging.getLogger(__name__)
require_version("datasets>=1.8.0")
set_seed({{ seed }})
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
    datefmt="%m/%d/%Y %H:%M:%S",
    level=logging.ERROR,
)
datasets.utils.logging.set_verbosity_warning()
transformers.utils.logging.set_verbosity_info()

{{ header("Load model and dataset") }}

{% if subset == 'default' %}
datasets = load_dataset('{{ dataset }}')
{% else %}
datasets = load_dataset('{{ dataset }}', '{{ subset }}')
{% endif %}
metric = load_metric("sacrebleu")
model_checkpoint = "{{ model_checkpoint }}"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)
{% if pretrained %}
model = AutoModelFor{{ task }}.from_pretrained(model_checkpoint)
{% else %}
config = AutoConfig.from_pretrained(model_checkpoint)
model = AutoModelFor{{ task }}.from_config(config)
{% endif %}
model.resize_token_embeddings(len(tokenizer))
model_name = model_checkpoint.split("/")[-1]

{{ header("Preprocessing") }}

source_lang = '{{ source_language }}'
target_lang = '{{ target_language }}'
{% if 'mbart' in model_checkpoint %}

# Set decoder_start_token_id
if model.config.decoder_start_token_id is None and isinstance(tokenizer, (MBartTokenizer, MBartTokenizerFast)):
    assert (
        target_lang is not None and source_lang is not None
    ), "mBart requires --target_lang and --source_lang"
    if isinstance(tokenizer, MBartTokenizer):
        model.config.decoder_start_token_id = tokenizer.lang_code_to_id[target_lang]
    else:
        model.config.decoder_start_token_id = tokenizer.convert_tokens_to_ids(target_lang)

{% endif %}
{% if 't5' in model_checkpoint %}
if model_checkpoint in ["t5-small", "t5-base", "t5-large", "t5-3b", "t5-11b"]:
    for language in (source_lang, target_lang):
        if language != language[:2]:
            logging.warning(
                'Extended language code %s not supported. Falling back on %s.',
                language, language[:2]
            )
    lang_id_to_string = {
        source_lang: babel.Locale(source_lang[:2]).english_name,
        target_lang: babel.Locale(target_lang[:2]).english_name,
    }
    src_str = 'translate {}'.format(lang_id_to_string[source_lang])
    tgt_str = ' to {}: '.format(lang_id_to_string[target_lang])
    prefix = src_str + tgt_str
else:
    prefix = ""
{% else %}
prefix = ""
{% endif %}
{% if 'mbart' in model_checkpoint %}

# For translation we set the codes of our source and target languages (only useful for mBART,
# the other models will ignore those attributes).
if isinstance(tokenizer, (MBartTokenizer, MBartTokenizerFast)):
    label = ['ar_AR', 'cs_CZ', 'de_DE', 'en_XX', 'es_XX', 'et_EE', 'fi_FI', 'fr_XX', 'gu_IN', 'hi_IN', 'it_IT', 'ja_XX', 'kk_KZ', 'ko_KR', 'lt_LT', 'lv_LV', 'my_MM', 'ne_NP', 'nl_XX', 'ro_RO', 'ru_RU', 'si_LK', 'tr_TR', 'vi_VN', 'zh_CN']
    source_code = [item for item in label if item.startswith(source_lang)][0]
    target_code = [item for item in label if item.startswith(target_lang)][0]
    if source_lang is not None:
        tokenizer.src_lang = source_code
    if target_lang is not None:
        tokenizer.tgt_lang = target_code
{% endif %}
max_input_length = {{ block_size }}
max_target_length = {{ block_size }}

def preprocess_function(examples):
    inputs = [prefix + ex[source_lang] for ex in examples["translation"]]
    targets = [ex[target_lang] for ex in examples["translation"]]
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)

    # Set up the tokenizer for targets
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=max_target_length, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs


tokenized_datasets = datasets.map(
    preprocess_function,
    batched=True,
    num_proc=4,
    remove_columns=list(set(sum(list(datasets.column_names.values()), []))),
    desc="Running tokenizer on dataset",
)

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)
batch_size = {{ batch_size }}

{{ header("Training") }}

def compute_metrics(eval_preds):
    preds, labels = eval_preds
    # In case the model returns more than the prediction logits
    if isinstance(preds, tuple):
        preds = preds[0]

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

    # Replace -100s in the labels as we can't decode them
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Some simple post-processing
    decoded_preds = [pred.strip() for pred in decoded_preds]
    decoded_labels = [[label.strip()] for label in decoded_labels]

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    return {"bleu": result["score"]}


def postprocess(predictions, labels):
    predictions = predictions.cpu().numpy()
    labels = labels.cpu().numpy()

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)

    # Replace -100 in the labels as we can't decode them.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Some simple post-processing
    decoded_preds = [pred.strip() for pred in decoded_preds]
    decoded_labels = [[label.strip()] for label in decoded_labels]
    return decoded_preds, decoded_labels


training_args = Seq2SeqTrainingArguments(
    output_dir=f"{model_name}-finetuned",
    per_device_train_batch_size={{ batch_size }},
    per_device_eval_batch_size={{ batch_size }},
    evaluation_strategy='epoch',
    logging_strategy='epoch',
    save_strategy='epoch',
    optim='{{ optimizer }}',
    learning_rate={{ lr }},
    num_train_epochs={{ num_epochs }},
    gradient_accumulation_steps={{ gradient_accumulation_steps }},
    lr_scheduler_type='{{ lr_scheduler_type }}',
    warmup_steps={{ num_warmup_steps }},
    {% if use_weight_decay %}
    weight_decay={{ weight_decay }},
    {% endif %}
    push_to_hub=False,
    dataloader_num_workers=0,
    {% if task == "MaskedLM" %}
    {% if whole_word_masking %}
    remove_unused_columns=False,
    {% endif %}
    {% endif %}
    load_best_model_at_end=True,
    log_level='error',
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["{{ train }}"],
    eval_dataset=tokenized_datasets["{{ validation }}"],
    data_collator=data_collator,
)

train_result = trainer.train()
trainer.save_model()
trainer.log_metrics("train", train_result.metrics)
trainer.save_metrics("train", train_result.metrics)
trainer.save_state()
eval_results = trainer.evaluate()
trainer.log_metrics("eval", eval_results)
trainer.save_metrics("eval", eval_results)
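Note that the translation template defines compute_metrics but never hands it to the Seq2SeqTrainer, so the evaluation above only reports the loss. If per-epoch BLEU is wanted, one possible wiring, sketched here under the assumption that generation-based evaluation fits the chosen model and that the split names are "train" and "validation", is:

# Hedged sketch: optional wiring to report BLEU during evaluation (split names are assumptions).
training_args = Seq2SeqTrainingArguments(
    output_dir=f"{model_name}-finetuned",
    evaluation_strategy="epoch",
    predict_with_generate=True,  # decode with model.generate() so compute_metrics sees token ids
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,  # the function defined in the template above
)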