# Before running, install required packages:
{% if notebook %}
!
{%- else %}
#
{%- endif %} pip install datasets transformers[sentencepiece] accelerate codecarbon
import collections
import logging
import math
import datasets
import numpy as np
import torch
import transformers
from accelerate import Accelerator
from accelerate.logging import get_logger
from accelerate.utils import set_seed
from codecarbon import EmissionsTracker
from datasets import load_dataset
from torch.optim import {{ optimizer }}
from torch.utils.data import DataLoader
from tqdm.auto import tqdm
from transformers import (AutoConfig, AutoModelFor{{ task }}, AutoTokenizer,
                          DataCollatorForLanguageModeling, default_data_collator,
                          get_scheduler)
from transformers.utils.versions import require_version
{{ header("Setup") }} | |
tracker = EmissionsTracker(log_level='error') | |
tracker.start() | |
logger = get_logger(__name__) | |
require_version("datasets>=1.8.0") | |
accelerator = Accelerator() | |
set_seed({{ seed }}) | |
logging.basicConfig( | |
format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", | |
datefmt="%m/%d/%Y %H:%M:%S", | |
level=logging.ERROR, | |
) | |
logger.info(accelerator.state, main_process_only=False) | |
if accelerator.is_local_main_process: | |
datasets.utils.logging.set_verbosity_warning() | |
transformers.utils.logging.set_verbosity_info() | |
else: | |
datasets.utils.logging.set_verbosity_error() | |
transformers.utils.logging.set_verbosity_error() | |
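# Keep third-party logging quiet on replicas: only the local main process shows
# datasets/transformers messages, so distributed runs do not print duplicates.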
{{ header("Load model and dataset") }} | |
{% if subset == 'default' %} | |
datasets = load_dataset('{{dataset}}') | |
{% else %} | |
datasets = load_dataset('{{dataset}}', '{{ subset }}') | |
{% endif %} | |
model_checkpoint = "{{model_checkpoint}}" | |
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True) | |
{% if pretrained %} | |
model = AutoModelFor{{task}}.from_pretrained(model_checkpoint) | |
{% else %} | |
config = AutoConfig.from_pretrained(model_checkpoint) | |
model = AutoModelFor{{task}}.from_config(config) | |
{% endif %} | |
model.resize_token_embeddings(len(tokenizer)) | |
model_name = model_checkpoint.split("/")[-1] | |
if tokenizer.pad_token is None: | |
tokenizer.pad_token = tokenizer.eos_token | |
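# Some checkpoints (GPT-2, for example) define no padding token; falling back to
# the end-of-sequence token lets the data collators pad batches.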
{{ header("Preprocessing") }} | |
def tokenize_function(examples): | |
result = tokenizer(examples["{{ feature }}"]) | |
{% if task=="MaskedLM" %} | |
{% if whole_word_masking %} | |
if tokenizer.is_fast: | |
result["word_ids"] = [result.word_ids(i) for i in range(len(result["input_ids"]))] | |
{% endif %} | |
{% endif %} | |
return result | |
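# word_ids() exists only on fast (Rust-backed) tokenizers; it maps every token back
# to the word it came from, which is what whole-word masking needs later on.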
with accelerator.main_process_first():
    tokenized_datasets = raw_datasets.map(
        tokenize_function,
        batched=True,
        num_proc=4,
        remove_columns=list(set(sum(list(raw_datasets.column_names.values()), []))),
        desc="Running tokenizer on dataset",
    )
block_size = {{ block_size }}

def group_texts(examples):
    # Concatenate all texts.
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the small remainder; you could pad instead if the model supported it.
    # Customize this part to your needs.
    total_length = (total_length // block_size) * block_size
    # Split into chunks of block_size.
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    result["labels"] = result["input_ids"].copy()
    return result
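# Copying input_ids into labels is the usual language-modeling setup: a causal LM
# shifts the labels internally, and for masked LM the collator below replaces the
# labels of unmasked positions with -100 so they are ignored by the loss.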
with accelerator.main_process_first():
    lm_datasets = tokenized_datasets.map(
        group_texts,
        batched=True,
        batch_size=1000,
        num_proc=4,
        desc=f"Grouping texts in chunks of {block_size}",
    )
{% if whole_word_masking %}
def whole_word_masking_data_collator(features):
    for feature in features:
        word_ids = feature.pop("word_ids")
        # Create a map between words and corresponding token indices
        mapping = collections.defaultdict(list)
        current_word_index = -1
        current_word = None
        for idx, word_id in enumerate(word_ids):
            if word_id is not None:
                if word_id != current_word:
                    current_word = word_id
                    current_word_index += 1
                mapping[current_word_index].append(idx)
        # Randomly mask whole words
        wwm_probability = {{ mlm_probability }}
        mask = np.random.binomial(1, wwm_probability, (len(mapping),))
        input_ids = feature["input_ids"]
        labels = feature["labels"]
        new_labels = [-100] * len(labels)
        for word_id in np.where(mask)[0]:
            word_id = word_id.item()
            for idx in mapping[word_id]:
                new_labels[idx] = labels[idx]
                input_ids[idx] = tokenizer.mask_token_id
        feature["labels"] = new_labels
    return default_data_collator(features)

data_collator = whole_word_masking_data_collator
{% else %}
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability={{ mlm_probability }})
{% endif %}
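# DataCollatorForLanguageModeling masks tokens independently; the collator above
# instead masks every token of a randomly selected word, which is what
# "whole word masking" means.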
def insert_random_mask(batch):
    features = [dict(zip(batch, t)) for t in zip(*batch.values())]
    masked_inputs = data_collator(features)
    # Create a new "masked" column for each column in the dataset
    return {"masked_" + k: v.numpy() for k, v in masked_inputs.items()}
{% if whole_word_masking %}
# Keep the "word_ids" column: whole_word_masking_data_collator pops it from each
# feature at collation time, both for the training batches and inside
# insert_random_mask above.
{% endif %}
with accelerator.main_process_first():
    eval_dataset = lm_datasets["{{ validation }}"].map(
        insert_random_mask,
        batched=True,
        remove_columns=lm_datasets["{{ validation }}"].column_names,
        desc="Inserting a random mask on eval dataset",
    )
eval_dataset = eval_dataset.rename_columns(
    {name: name.split('masked_')[1] for name in eval_dataset.features.keys()}
)
batch_size = {{ batch_size }}
train_dataloader = DataLoader(
    lm_datasets["{{ train }}"],
    shuffle=True,
    batch_size=batch_size,
    collate_fn=data_collator,
)
eval_dataloader = DataLoader(
    eval_dataset, batch_size=batch_size, collate_fn=default_data_collator
)
{{ header("Training") }} | |
{% if use_weight_decay %} | |
weight_decay = {{ weight_decay }} | |
def get_grouped_params(model, no_decay=["bias", "LayerNorm.weight"]): | |
params_with_wd, params_without_wd = [], [] | |
for n, p in model.named_parameters(): | |
if any(nd in n for nd in no_decay): | |
params_without_wd.append(p) | |
else: | |
params_with_wd.append(p) | |
return [ | |
{"params": params_with_wd, "weight_decay": weight_decay}, | |
{"params": params_without_wd, "weight_decay": 0.0}, | |
] | |
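# Following common practice, bias and LayerNorm weights are excluded from weight
# decay; only the remaining parameters are regularized.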
optimizer = {{ optimizer }}(get_grouped_params(model), lr={{ lr }})
{% else %}
optimizer = {{ optimizer }}(model.parameters(), lr={{ lr }})
{% endif %}
model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
    model, optimizer, train_dataloader, eval_dataloader
)
num_train_epochs = {{ num_epochs }}
gradient_accumulation_steps = {{ gradient_accumulation_steps }}
num_update_steps_per_epoch = math.ceil(len(train_dataloader) / gradient_accumulation_steps)
max_train_steps = num_train_epochs * num_update_steps_per_epoch
output_dir = f"{model_name}-finetuned"
lr_scheduler = get_scheduler(
    '{{ lr_scheduler_type }}',
    optimizer=optimizer,
    num_warmup_steps={{ num_warmup_steps }},
    num_training_steps=max_train_steps,
)
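# max_train_steps counts optimizer updates rather than micro-batches, so the
# scheduler is stepped once per update in the loop below.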
progress_bar = tqdm(range(max_train_steps), disable=not accelerator.is_local_main_process)
for epoch in range(num_train_epochs):
    # Training
    model.train()
    for step, batch in enumerate(train_dataloader):
        outputs = model(**batch)
        loss = outputs.loss / gradient_accumulation_steps
        accelerator.backward(loss)
        if step % gradient_accumulation_steps == 0 or step == len(train_dataloader) - 1:
            # TODO: let the user decide on the gradient clipping norm
            accelerator.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()
            progress_bar.update(1)
    # Evaluation
    model.eval()
    losses = []
    for step, batch in enumerate(eval_dataloader):
        with torch.no_grad():
            outputs = model(**batch)
        loss = outputs.loss
        losses.append(accelerator.gather(loss.repeat(batch_size)))
    losses = torch.cat(losses)
    losses = losses[: len(eval_dataset)]
    try:
        eval_loss = torch.mean(losses)
        # Perplexity is the exponential of the mean evaluation loss.
        perplexity = math.exp(eval_loss)
    except OverflowError:
        perplexity = float("inf")
    accelerator.print({"loss/eval": eval_loss, "perplexity": perplexity})
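# Training is finished: sync all processes, unwrap the model from its accelerate
# wrapper, and save it (plus the tokenizer) from the main process so the checkpoint
# can be reloaded with from_pretrained().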
accelerator.wait_for_everyone()
unwrapped_model = accelerator.unwrap_model(model)
unwrapped_model.save_pretrained(output_dir, save_function=accelerator.save)
if accelerator.is_main_process:
    tokenizer.save_pretrained(output_dir)
emissions = tracker.stop()
accelerator.print(f'Emissions: {emissions} kg')