# Before running, install required packages:
{% if notebook %}
!
{%- else %}
#
{%- endif %} pip install datasets transformers

import collections
import logging
import math

import numpy as np

import datasets
import transformers
from datasets import load_dataset
from transformers import (AutoConfig, AutoModelForMaskedLM, AutoTokenizer,
                          DataCollatorForLanguageModeling, Trainer,
                          TrainingArguments, default_data_collator, set_seed)
from transformers.testing_utils import CaptureLogger
from transformers.utils.versions import require_version

{{ header("Setup") }}

logger = logging.getLogger(__name__)

require_version("datasets>=1.8.0")

set_seed({{ seed }})

logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
    datefmt="%m/%d/%Y %H:%M:%S",
    level=logging.ERROR,
)
datasets.utils.logging.set_verbosity_warning()
transformers.utils.logging.set_verbosity_info()

{{ header("Load model and dataset") }}

{% if subset == 'default' %}
raw_datasets = load_dataset('{{ dataset }}')
{% else %}
raw_datasets = load_dataset('{{ dataset }}', '{{ subset }}')
{% endif %}

model_checkpoint = "{{ model_checkpoint }}"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)
{% if pretrained %}
model = AutoModelFor{{ task }}.from_pretrained(model_checkpoint)
{% else %}
config = AutoConfig.from_pretrained(model_checkpoint)
model = AutoModelFor{{ task }}.from_config(config)
{% endif %}
model.resize_token_embeddings(len(tokenizer))
model_name = model_checkpoint.split("/")[-1]

if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

{{ header("Preprocessing") }}

# Force the tokenizer logger to load before tokenize_function is pickled;
# this avoids a _LazyModule error in the datasets Hasher.
tok_logger = transformers.utils.logging.get_logger("transformers.tokenization_utils_base")

def tokenize_function(examples):
    with CaptureLogger(tok_logger) as cl:
        result = tokenizer(examples["{{ feature }}"])
    if "Token indices sequence length is longer than the" in cl.out:
        tok_logger.warning(
            "^^^^^^^^^^^^^^^^ Please ignore the warning above - this long input will be chunked into smaller bits"
            " before being passed to the model."
        )
    if tokenizer.is_fast:
        result["word_ids"] = [result.word_ids(i) for i in range(len(result["input_ids"]))]
    return result

tokenized_datasets = raw_datasets.map(
    tokenize_function,
    batched=True,
    num_proc=4,
    remove_columns=list(set(sum(raw_datasets.column_names.values(), []))),
    desc="Running tokenizer on dataset",
)

block_size = {{ block_size }}

def group_texts(examples):
    # Concatenate all texts.
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the small remainder; we could add padding instead if the model supported it.
    # You can customize this part to your needs.
    total_length = (total_length // block_size) * block_size
    # Split into chunks of block_size.
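    # Each value in concatenated_examples is one flat list of token ids, so slicing it
    # every block_size positions yields fixed-length training examples. Copying input_ids
    # into labels below is fine: the data collator later rebuilds the labels for masked-LM
    # training (original token id at masked positions, -100 everywhere else).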
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    result["labels"] = result["input_ids"].copy()
    return result

lm_datasets = tokenized_datasets.map(
    group_texts,
    batched=True,
    batch_size=1000,
    num_proc=4,
    desc=f"Grouping texts in chunks of {block_size}",
)

{{ header("Training") }}

training_args = TrainingArguments(
    output_dir=f"{model_name}-finetuned",
    per_device_train_batch_size={{ batch_size }},
    per_device_eval_batch_size={{ batch_size }},
    evaluation_strategy='epoch',
    logging_strategy='epoch',
    save_strategy='epoch',
    optim='{{ optimizer }}',
    learning_rate={{ lr }},
    num_train_epochs={{ num_epochs }},
    gradient_accumulation_steps={{ gradient_accumulation_steps }},
    lr_scheduler_type='{{ lr_scheduler_type }}',
    warmup_steps={{ num_warmup_steps }},
{% if use_weight_decay %}
    weight_decay={{ weight_decay }},
{% endif %}
    push_to_hub=False,
    dataloader_num_workers=0,
{% if task == "MaskedLM" %}
{% if whole_word_masking %}
    remove_unused_columns=False,
{% endif %}
{% endif %}
    load_best_model_at_end=True,
    log_level='error',
)

{% if whole_word_masking %}
def whole_word_masking_data_collator(features):
    for feature in features:
        word_ids = feature.pop("word_ids")

        # Create a map between words and their corresponding token indices.
        mapping = collections.defaultdict(list)
        current_word_index = -1
        current_word = None
        for idx, word_id in enumerate(word_ids):
            if word_id is not None:
                if word_id != current_word:
                    current_word = word_id
                    current_word_index += 1
                mapping[current_word_index].append(idx)

        # Randomly mask whole words.
        wwm_probability = {{ mlm_probability }}
        mask = np.random.binomial(1, wwm_probability, (len(mapping),))
        input_ids = feature["input_ids"]
        labels = feature["labels"]
        new_labels = [-100] * len(labels)
        for word_id in np.where(mask)[0]:
            word_id = word_id.item()
            for idx in mapping[word_id]:
                new_labels[idx] = labels[idx]
                input_ids[idx] = tokenizer.mask_token_id
        feature["labels"] = new_labels

    return default_data_collator(features)

data_collator = whole_word_masking_data_collator
{% else %}
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability={{ mlm_probability }})
{% endif %}

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=lm_datasets["{{ train }}"],
    eval_dataset=lm_datasets["{{ validation }}"],
    data_collator=data_collator,
)

train_result = trainer.train()
trainer.save_model()
trainer.log_metrics("train", train_result.metrics)
trainer.save_metrics("train", train_result.metrics)
trainer.save_state()

eval_results = trainer.evaluate()
eval_results["perplexity"] = math.exp(eval_results["eval_loss"])
print(f"Perplexity: {eval_results['perplexity']:.2f}")
trainer.log_metrics("eval", eval_results)
trainer.save_metrics("eval", eval_results)
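
{% if task == "MaskedLM" %}
# Optional sanity check, a minimal sketch rather than part of the training run above:
# reload the model saved to f"{model_name}-finetuned" into a fill-mask pipeline and
# inspect its top predictions. It assumes the tokenizer has a mask token; the example
# sentence is illustrative only.
from transformers import pipeline

fill_mask = pipeline("fill-mask", model=f"{model_name}-finetuned", tokenizer=tokenizer)
for prediction in fill_mask(f"The goal of life is {tokenizer.mask_token}."):
    print(f"{prediction['sequence']} (score: {prediction['score']:.3f})")
{% endif %}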