# Before running, install required packages:
{% if notebook %} ! {%- else %} # {%- endif %} pip install datasets transformers[sentencepiece] accelerate codecarbon

import collections
import logging
import math

import datasets
import numpy as np
import torch
import transformers
from accelerate import Accelerator
from accelerate.logging import get_logger
from accelerate.utils import set_seed
from codecarbon import EmissionsTracker
from datasets import load_dataset
from torch.optim import {{ optimizer }}
from torch.utils.data import DataLoader
from tqdm.auto import tqdm
from transformers import (AutoConfig, AutoModelForMaskedLM, AutoTokenizer,
                          DataCollatorForLanguageModeling, default_data_collator,
                          get_scheduler)
from transformers.utils.versions import require_version

{{ header("Setup") }}

tracker = EmissionsTracker(log_level='error')
tracker.start()

logger = get_logger(__name__)
require_version("datasets>=1.8.0")

accelerator = Accelerator()
set_seed({{ seed }})

logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
    datefmt="%m/%d/%Y %H:%M:%S",
    level=logging.ERROR,
)
logger.info(accelerator.state, main_process_only=False)

if accelerator.is_local_main_process:
    datasets.utils.logging.set_verbosity_warning()
    transformers.utils.logging.set_verbosity_info()
else:
    datasets.utils.logging.set_verbosity_error()
    transformers.utils.logging.set_verbosity_error()

{{ header("Load model and dataset") }}

{% if subset == 'default' %}
raw_datasets = load_dataset('{{dataset}}')
{% else %}
raw_datasets = load_dataset('{{dataset}}', '{{ subset }}')
{% endif %}

model_checkpoint = "{{model_checkpoint}}"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)

{% if pretrained %}
model = AutoModelFor{{task}}.from_pretrained(model_checkpoint)
{% else %}
config = AutoConfig.from_pretrained(model_checkpoint)
model = AutoModelFor{{task}}.from_config(config)
{% endif %}

model.resize_token_embeddings(len(tokenizer))
model_name = model_checkpoint.split("/")[-1]

if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

{{ header("Preprocessing") }}

def tokenize_function(examples):
    result = tokenizer(examples["{{ feature }}"])
{% if task=="MaskedLM" %}
{% if whole_word_masking %}
    if tokenizer.is_fast:
        result["word_ids"] = [result.word_ids(i) for i in range(len(result["input_ids"]))]
{% endif %}
{% endif %}
    return result

with accelerator.main_process_first():
    tokenized_datasets = raw_datasets.map(
        tokenize_function,
        batched=True,
        num_proc=4,
        remove_columns=list(set(sum(list(raw_datasets.column_names.values()), []))),
        desc="Running tokenizer on dataset",
    )

block_size = {{ block_size }}
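# Optional guard (a sketch, not part of the original recipe): most checkpoints cannot attend over
# sequences longer than tokenizer.model_max_length, so warn if block_size exceeds it.
if block_size > tokenizer.model_max_length:
    logger.warning(
        f"block_size ({block_size}) is larger than the maximum length supported by the model "
        f"({tokenizer.model_max_length}); consider lowering it."
    )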
def group_texts(examples):
    # Concatenate all texts.
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the small remainder; we could add padding instead if the model supported it.
    # You can customize this part to your needs.
    total_length = (total_length // block_size) * block_size
    # Split into chunks of block_size.
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    result["labels"] = result["input_ids"].copy()
    return result

with accelerator.main_process_first():
    lm_datasets = tokenized_datasets.map(
        group_texts,
        batched=True,
        batch_size=1000,
        num_proc=4,
        desc=f"Grouping texts in chunks of {block_size}",
    )

{% if whole_word_masking %}
def whole_word_masking_data_collator(features):
    for feature in features:
        word_ids = feature.pop("word_ids")

        # Create a map between words and corresponding token indices
        mapping = collections.defaultdict(list)
        current_word_index = -1
        current_word = None
        for idx, word_id in enumerate(word_ids):
            if word_id is not None:
                if word_id != current_word:
                    current_word = word_id
                    current_word_index += 1
                mapping[current_word_index].append(idx)

        # Randomly mask whole words
        wwm_probability = {{ mlm_probability }}
        mask = np.random.binomial(1, wwm_probability, (len(mapping),))
        input_ids = feature["input_ids"]
        labels = feature["labels"]
        new_labels = [-100] * len(labels)
        for word_id in np.where(mask)[0]:
            word_id = word_id.item()
            for idx in mapping[word_id]:
                new_labels[idx] = labels[idx]
                input_ids[idx] = tokenizer.mask_token_id
        feature["labels"] = new_labels

    return default_data_collator(features)

data_collator = whole_word_masking_data_collator
{% else %}
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability={{ mlm_probability }})
{% endif %}

def insert_random_mask(batch):
    features = [dict(zip(batch, t)) for t in zip(*batch.values())]
    masked_inputs = data_collator(features)
    # Create a new "masked" column for each column in the dataset
    return {"masked_" + k: v.numpy() for k, v in masked_inputs.items()}

{% if whole_word_masking %}
# Keep the "word_ids" column: the whole-word-masking collator pops it from each feature itself.
{% endif %}
with accelerator.main_process_first():
    eval_dataset = lm_datasets["{{ validation }}"].map(
        insert_random_mask,
        batched=True,
        remove_columns=lm_datasets["{{ validation }}"].column_names,
        desc="Inserting a random mask on eval dataset",
    )
eval_dataset = eval_dataset.rename_columns(
    {name: name.split('masked_')[1] for name in eval_dataset.features.keys()}
)

batch_size = {{ batch_size }}
train_dataloader = DataLoader(
    lm_datasets["{{ train }}"],
    shuffle=True,
    batch_size=batch_size,
    collate_fn=data_collator,
)
eval_dataloader = DataLoader(
    eval_dataset, batch_size=batch_size, collate_fn=default_data_collator
)

{{ header("Training") }}

{% if use_weight_decay %}
weight_decay = {{ weight_decay }}

def get_grouped_params(model, no_decay=["bias", "LayerNorm.weight"]):
    params_with_wd, params_without_wd = [], []
    for n, p in model.named_parameters():
        if any(nd in n for nd in no_decay):
            params_without_wd.append(p)
        else:
            params_with_wd.append(p)
    return [
        {"params": params_with_wd, "weight_decay": weight_decay},
        {"params": params_without_wd, "weight_decay": 0.0},
    ]

optimizer = {{ optimizer }}(get_grouped_params(model), lr={{ lr }})
{% else %}
optimizer = {{ optimizer }}(model.parameters(), lr={{ lr }})
{% endif %}

model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
    model, optimizer, train_dataloader, eval_dataloader
)

num_train_epochs = {{ num_epochs }}
gradient_accumulation_steps = {{ gradient_accumulation_steps }}
num_update_steps_per_epoch = math.ceil(len(train_dataloader) / gradient_accumulation_steps)
max_train_steps = num_train_epochs * num_update_steps_per_epoch
output_dir = f"{model_name}-finetuned"

lr_scheduler = get_scheduler(
    '{{ lr_scheduler_type }}',
    optimizer=optimizer,
    num_warmup_steps={{ num_warmup_steps }},
    num_training_steps=max_train_steps,
)
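# Optional logging (a sketch, not in the original template): report the effective batch size and
# step counts before training starts. accelerator.num_processes is a standard Accelerate
# attribute; every other name is defined above.
total_batch_size = batch_size * accelerator.num_processes * gradient_accumulation_steps
accelerator.print(f"Num epochs = {num_train_epochs}")
accelerator.print(f"Effective train batch size (incl. parallelism & accumulation) = {total_batch_size}")
accelerator.print(f"Total optimization steps = {max_train_steps}")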
progress_bar = tqdm(range(max_train_steps), disable=not accelerator.is_local_main_process)

for epoch in range(num_train_epochs):
    # Training
    model.train()
    for step, batch in enumerate(train_dataloader):
        outputs = model(**batch)
        loss = outputs.loss / gradient_accumulation_steps
        accelerator.backward(loss)
        if step % gradient_accumulation_steps == 0 or step == len(train_dataloader) - 1:
            # TODO: let the user decide on the gradient-clipping value
            accelerator.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()
            progress_bar.update(1)

    # Evaluation
    model.eval()
    losses = []
    for step, batch in enumerate(eval_dataloader):
        with torch.no_grad():
            outputs = model(**batch)

        loss = outputs.loss
        losses.append(accelerator.gather(loss.repeat(batch_size)))

    losses = torch.cat(losses)
    losses = losses[: len(eval_dataset)]
    try:
        eval_loss = torch.mean(losses)
        perplexity = math.exp(eval_loss)
    except OverflowError:
        perplexity = float("inf")

    accelerator.print({"loss/eval": eval_loss, "perplexity": perplexity})
    model.train()

accelerator.wait_for_everyone()
unwrapped_model = accelerator.unwrap_model(model)
unwrapped_model.save_pretrained(output_dir, save_function=accelerator.save)
if accelerator.is_main_process:
    tokenizer.save_pretrained(output_dir)

emissions = tracker.stop()
accelerator.print(f'Emissions: {emissions} kg')
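# Optional quick check (a sketch, not part of the original recipe): reload the saved checkpoint
# with a fill-mask pipeline on the main process to confirm the export worked. The example
# sentence is arbitrary; adapt or remove this block for non-MaskedLM heads.
{% if task == "MaskedLM" %}
if accelerator.is_main_process:
    from transformers import pipeline

    mask_filler = pipeline("fill-mask", model=output_dir)
    print(mask_filler(f"Paris is the {tokenizer.mask_token} of France."))
{% endif %}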