# Before running, install required packages:
{% if notebook %} ! {%- else %} # {%- endif %} pip install datasets transformers[sentencepiece] accelerate sacrebleu==1.4.14 codecarbon sacremoses babel

import logging
import math

import babel
import datasets
import numpy as np
import torch
import transformers
from accelerate import Accelerator
from accelerate.logging import get_logger
from accelerate.utils import set_seed
from codecarbon import EmissionsTracker
from datasets import load_dataset, load_metric
from torch.optim import {{ optimizer }}
from torch.utils.data import DataLoader
from tqdm.auto import tqdm
from transformers import (AutoConfig, AutoModelForSeq2SeqLM, AutoTokenizer,
                          DataCollatorForSeq2Seq, MBartTokenizer,
                          MBartTokenizerFast, get_scheduler)
from transformers.utils.versions import require_version

{{ header("Setup") }}

tracker = EmissionsTracker(log_level='error')
tracker.start()

logger = get_logger(__name__)
require_version("datasets>=1.8.0")

accelerator = Accelerator()
set_seed({{ seed }})

logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
    datefmt="%m/%d/%Y %H:%M:%S",
    level=logging.INFO,
)
logger.info(accelerator.state, main_process_only=False)
if accelerator.is_local_main_process:
    datasets.utils.logging.set_verbosity_warning()
    transformers.utils.logging.set_verbosity_info()
else:
    datasets.utils.logging.set_verbosity_error()
    transformers.utils.logging.set_verbosity_error()

{{ header("Load model and dataset") }}

# Keep the loaded dataset under its own name so it does not shadow the
# `datasets` module imported above.
{% if subset == 'default' %}
raw_datasets = load_dataset('{{dataset}}')
{% else %}
raw_datasets = load_dataset('{{dataset}}', '{{ subset }}')
{% endif %}

metric = load_metric("sacrebleu")

model_checkpoint = "{{model_checkpoint}}"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)

{% if pretrained %}
model = AutoModelFor{{task}}.from_pretrained(model_checkpoint)
{% else %}
config = AutoConfig.from_pretrained(model_checkpoint)
model = AutoModelFor{{task}}.from_config(config)
{% endif %}
model.resize_token_embeddings(len(tokenizer))

model_name = model_checkpoint.split("/")[-1]

{{ header("Preprocessing") }}

source_lang = '{{ source_language }}'
target_lang = '{{ target_language }}'

{% if 'mbart' in model_checkpoint %}
# Set decoder_start_token_id
if model.config.decoder_start_token_id is None and isinstance(tokenizer, (MBartTokenizer, MBartTokenizerFast)):
    assert (
        target_lang is not None and source_lang is not None
    ), "mBART requires source_lang and target_lang to be set"
    if isinstance(tokenizer, MBartTokenizer):
        model.config.decoder_start_token_id = tokenizer.lang_code_to_id[target_lang]
    else:
        model.config.decoder_start_token_id = tokenizer.convert_tokens_to_ids(target_lang)
{% endif %}

{% if 't5' in model_checkpoint %}
if model_checkpoint in ["t5-small", "t5-base", "t5-large", "t5-3b", "t5-11b"]:
    for language in (source_lang, target_lang):
        if language != language[:2]:
            logger.warning(
                'Extended language code %s not supported. Falling back on %s.',
                language, language[:2]
            )
    lang_id_to_string = {
        source_lang: babel.Locale(source_lang[:2]).english_name,
        target_lang: babel.Locale(target_lang[:2]).english_name,
    }
    src_str = 'translate {}'.format(lang_id_to_string[source_lang])
    tgt_str = ' to {}: '.format(lang_id_to_string[target_lang])
    prefix = src_str + tgt_str
else:
    prefix = ""
{% else %}
prefix = ""
{% endif %}

{% if 'mbart' in model_checkpoint %}
# For translation we set the codes of our source and target languages (only useful
# for mBART, the others will ignore those attributes).
if isinstance(tokenizer, (MBartTokenizer, MBartTokenizerFast)):
    lang_codes = ['ar_AR', 'cs_CZ', 'de_DE', 'en_XX', 'es_XX', 'et_EE', 'fi_FI',
                  'fr_XX', 'gu_IN', 'hi_IN', 'it_IT', 'ja_XX', 'kk_KZ', 'ko_KR',
                  'lt_LT', 'lv_LV', 'my_MM', 'ne_NP', 'nl_XX', 'ro_RO', 'ru_RU',
                  'si_LK', 'tr_TR', 'vi_VN', 'zh_CN']
    source_code = [code for code in lang_codes if code.startswith(source_lang)][0]
    target_code = [code for code in lang_codes if code.startswith(target_lang)][0]
    if source_lang is not None:
        tokenizer.src_lang = source_code
    if target_lang is not None:
        tokenizer.tgt_lang = target_code
{% endif %}

max_input_length = {{ block_size }}
max_target_length = {{ block_size }}


def preprocess_function(examples):
    inputs = [prefix + ex[source_lang] for ex in examples["translation"]]
    targets = [ex[target_lang] for ex in examples["translation"]]
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)

    # Set up the tokenizer for targets
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=max_target_length, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs


with accelerator.main_process_first():
    tokenized_datasets = raw_datasets.map(
        preprocess_function,
        batched=True,
        num_proc=4,
        remove_columns=list(set(sum(raw_datasets.column_names.values(), []))),
        desc="Running tokenizer on dataset",
    )

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model,
                                       pad_to_multiple_of=8 if accelerator.use_fp16 else None)

batch_size = {{ batch_size }}
train_dataloader = DataLoader(tokenized_datasets["{{ train }}"], batch_size=batch_size,
                              shuffle=True, collate_fn=data_collator)
eval_dataloader = DataLoader(tokenized_datasets["{{ validation }}"], batch_size=batch_size,
                             collate_fn=data_collator)
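
# Optional sanity check (not in the original template): inspect one collated batch
# to confirm that inputs are padded and that padded label positions are set to -100,
# which the loss computation and postprocessing below rely on.
if accelerator.is_local_main_process:
    sample_batch = next(iter(train_dataloader))
    print({name: tuple(tensor.shape) for name, tensor in sample_batch.items()})
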
{{ header("Training") }}


def postprocess(predictions, labels):
    predictions = predictions.cpu().numpy()
    labels = labels.cpu().numpy()

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)

    # Replace -100 in the labels as we can't decode them
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Some simple post-processing
    decoded_preds = [pred.strip() for pred in decoded_preds]
    decoded_labels = [[label.strip()] for label in decoded_labels]
    return decoded_preds, decoded_labels
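
# Note on shapes (added for clarity; the strings are hypothetical): sacreBLEU expects
# one hypothesis string per example and a *list* of reference strings per example,
# which is why each label is wrapped in its own list above, e.g.
#   metric.compute(predictions=["the cat sat on the mat"],
#                  references=[["the cat sat on the mat"]])["score"]  # -> 100.0
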
{% if use_weight_decay %}
weight_decay = {{ weight_decay }}


def get_grouped_params(model, no_decay=["bias", "LayerNorm.weight"]):
    # Apply weight decay to all parameters except biases and LayerNorm weights.
    params_with_wd, params_without_wd = [], []
    for n, p in model.named_parameters():
        if any(nd in n for nd in no_decay):
            params_without_wd.append(p)
        else:
            params_with_wd.append(p)
    return [
        {"params": params_with_wd, "weight_decay": weight_decay},
        {"params": params_without_wd, "weight_decay": 0.0},
    ]


optimizer = {{ optimizer }}(get_grouped_params(model), lr={{ lr }})
{% else %}
optimizer = {{ optimizer }}(model.parameters(), lr={{ lr }})
{% endif %}

model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
    model, optimizer, train_dataloader, eval_dataloader
)

num_train_epochs = {{ num_epochs }}
gradient_accumulation_steps = {{ gradient_accumulation_steps }}
num_update_steps_per_epoch = math.ceil(len(train_dataloader) / gradient_accumulation_steps)
max_train_steps = num_train_epochs * num_update_steps_per_epoch

output_dir = f"{model_name}-finetuned"

lr_scheduler = get_scheduler(
    '{{ lr_scheduler_type }}',
    optimizer=optimizer,
    num_warmup_steps={{ num_warmup_steps }},
    num_training_steps=max_train_steps,
)

progress_bar = tqdm(range(max_train_steps), disable=not accelerator.is_local_main_process)

for epoch in range(num_train_epochs):
    # Training
    model.train()
    for step, batch in enumerate(train_dataloader):
        outputs = model(**batch)
        loss = outputs.loss
        loss = loss / gradient_accumulation_steps
        accelerator.backward(loss)
        # Only step the optimizer once a full accumulation window (or the final
        # partial window of the epoch) has been processed.
        if (step + 1) % gradient_accumulation_steps == 0 or step == len(train_dataloader) - 1:
            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()
            progress_bar.update(1)

    # Evaluation
    model.eval()
    samples_seen = 0
    for step, batch in enumerate(eval_dataloader):
        with torch.no_grad():
            generated_tokens = accelerator.unwrap_model(model).generate(
                batch["input_ids"],
                attention_mask=batch["attention_mask"],
                max_length=max_target_length,
            )
        labels = batch["labels"]

        # Necessary to pad predictions and labels for being gathered
        generated_tokens = accelerator.pad_across_processes(
            generated_tokens, dim=1, pad_index=tokenizer.pad_token_id
        )
        labels = accelerator.pad_across_processes(labels, dim=1, pad_index=-100)

        predictions_gathered = accelerator.gather(generated_tokens)
        labels_gathered = accelerator.gather(labels)

        decoded_preds, decoded_labels = postprocess(predictions_gathered, labels_gathered)
        if accelerator.num_processes > 1:
            # Drop the duplicate examples the distributed samplers add to the last
            # batch, so every example is counted exactly once.
            if step == len(eval_dataloader) - 1:
                decoded_preds = decoded_preds[: len(eval_dataloader.dataset) - samples_seen]
                decoded_labels = decoded_labels[: len(eval_dataloader.dataset) - samples_seen]
            else:
                samples_seen += len(decoded_labels)
        metric.add_batch(predictions=decoded_preds, references=decoded_labels)

    results = metric.compute()
    print(f"epoch {epoch}, BLEU score: {results['score']:.2f}")

# Save and upload
accelerator.wait_for_everyone()
unwrapped_model = accelerator.unwrap_model(model)
unwrapped_model.save_pretrained(output_dir, save_function=accelerator.save)
if accelerator.is_main_process:
    tokenizer.save_pretrained(output_dir)

emissions = tracker.stop()
print(f'Emissions: {emissions} kg CO2eq')
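
# Quick smoke test of the saved artifacts (added for illustration; the sample
# sentence is hypothetical, and mBART-style checkpoints would additionally need
# forced_bos_token_id set to the target language code at generation time):
if accelerator.is_main_process:
    reloaded_model = AutoModelFor{{task}}.from_pretrained(output_dir)
    reloaded_tokenizer = AutoTokenizer.from_pretrained(output_dir)
    sample = reloaded_tokenizer(prefix + "A short test sentence.", return_tensors="pt")
    sample_ids = reloaded_model.generate(**sample, max_length=max_target_length)
    print(reloaded_tokenizer.decode(sample_ids[0], skip_special_tokens=True))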