# Before running, install required packages:
{% if notebook %}
!
{%- else %}
#
{%- endif %}
pip install datasets transformers
import collections
import math
import logging
import numpy as np
import transformers
import datasets
from datasets import load_dataset
from transformers import (AutoConfig, AutoModelForMaskedLM, AutoTokenizer,
DataCollatorForLanguageModeling, Trainer,
TrainingArguments, default_data_collator, set_seed)
from transformers.testing_utils import CaptureLogger
from transformers.utils.versions import require_version
{{ header("Setup") }}
logger = logging.getLogger(__name__)
require_version("datasets>=1.8.0")
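# Fix the Python, NumPy and PyTorch random seeds so runs are reproducible.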
set_seed({{ seed }})
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
    datefmt="%m/%d/%Y %H:%M:%S",
    level=logging.ERROR,
)
datasets.utils.logging.set_verbosity_warning()
transformers.utils.logging.set_verbosity_info()
{{ header("Load model and dataset") }}
{% if subset == 'default' %}
raw_datasets = load_dataset('{{ dataset }}')
{% else %}
raw_datasets = load_dataset('{{ dataset }}', '{{ subset }}')
{% endif %}
model_checkpoint = "{{model_checkpoint}}"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)
{% if pretrained %}
model = AutoModelFor{{task}}.from_pretrained(model_checkpoint)
{% else %}
config = AutoConfig.from_pretrained(model_checkpoint)
model = AutoModelFor{{task}}.from_config(config)
{% endif %}
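# Resize the embedding matrix to the tokenizer's vocabulary size (a no-op if they already match).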
model.resize_token_embeddings(len(tokenizer))
model_name = model_checkpoint.split("/")[-1]
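# Some checkpoints ship without a pad token; fall back to the EOS token so the data collator can pad batches.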
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
{{ header("Preprocessing") }}
# tokenize_function will be pickled by the datasets Hasher; load the tokenizer logger here first to avoid a _LazyModule pickling error.
tok_logger = transformers.utils.logging.get_logger("transformers.tokenization_utils_base")
def tokenize_function(examples):
    with CaptureLogger(tok_logger) as cl:
        result = tokenizer(examples["{{ feature }}"])
    if "Token indices sequence length is longer than the" in cl.out:
        tok_logger.warning(
            "^^^^^^^^^^^^^^^^ Please ignore the warning above - this long input will be chunked into smaller bits"
            " before being passed to the model."
        )
    if tokenizer.is_fast:
        # Keep the word-to-token alignment; it is needed by the whole-word-masking collator.
        result["word_ids"] = [result.word_ids(i) for i in range(len(result["input_ids"]))]
    return result
tokenized_datasets = raw_datasets.map(
    tokenize_function,
    batched=True,
    num_proc=4,
    remove_columns=list(set(sum(raw_datasets.column_names.values(), []))),
    desc="Running tokenizer on dataset",
)
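# Concatenate all tokenized texts and split them into fixed-length blocks of block_size tokens.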
block_size = {{ block_size }}
def group_texts(examples):
    # Concatenate all texts.
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # Drop the small remainder; we could pad instead if the model supported it.
    # Customize this part to your needs.
    total_length = (total_length // block_size) * block_size
    # Split into chunks of block_size.
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    # Labels start as a copy of the inputs; the data collator applies the masking at batch time.
    result["labels"] = result["input_ids"].copy()
    return result
lm_datasets = tokenized_datasets.map(
    group_texts,
    batched=True,
    batch_size=1000,
    num_proc=4,
    desc=f"Grouping texts in chunks of {block_size}",
)
{{ header("Training") }}
training_args = TrainingArguments(
    output_dir=f"{model_name}-finetuned",
    per_device_train_batch_size={{ batch_size }},
    per_device_eval_batch_size={{ batch_size }},
    evaluation_strategy='epoch',
    logging_strategy='epoch',
    save_strategy='epoch',
    optim='{{ optimizer }}',
    learning_rate={{ lr }},
    num_train_epochs={{ num_epochs }},
    gradient_accumulation_steps={{ gradient_accumulation_steps }},
    lr_scheduler_type='{{ lr_scheduler_type }}',
    warmup_steps={{ num_warmup_steps }},
{% if use_weight_decay %}
    weight_decay={{ weight_decay }},
{% endif %}
    push_to_hub=False,
    dataloader_num_workers=0,
{% if task == "MaskedLM" %}
{% if whole_word_masking %}
    remove_unused_columns=False,
{% endif %}
{% endif %}
    load_best_model_at_end=True,
    log_level='error',
)
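# Masking is applied dynamically by the data collator when batches are built, not during preprocessing.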
{% if whole_word_masking %}
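# Whole-word masking: every token of a randomly selected word is masked together, which is why the
# word_ids column is kept (remove_unused_columns=False above).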
def whole_word_masking_data_collator(features):
    for feature in features:
        word_ids = feature.pop("word_ids")
        # Create a map between words and their corresponding token indices.
        mapping = collections.defaultdict(list)
        current_word_index = -1
        current_word = None
        for idx, word_id in enumerate(word_ids):
            if word_id is not None:
                if word_id != current_word:
                    current_word = word_id
                    current_word_index += 1
                mapping[current_word_index].append(idx)
        # Randomly select whole words to mask.
        wwm_probability = {{ mlm_probability }}
        mask = np.random.binomial(1, wwm_probability, (len(mapping),))
        input_ids = feature["input_ids"]
        labels = feature["labels"]
        new_labels = [-100] * len(labels)
        for word_id in np.where(mask)[0]:
            word_id = word_id.item()
            for idx in mapping[word_id]:
                # Mask every token of the selected word and only compute the loss on those positions.
                new_labels[idx] = labels[idx]
                input_ids[idx] = tokenizer.mask_token_id
        feature["labels"] = new_labels
    return default_data_collator(features)
data_collator = whole_word_masking_data_collator
{% else %}
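# Standard masked-language-modeling collator: individual tokens are masked at random with the given probability.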
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability={{ mlm_probability }})
{% endif %}
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=lm_datasets["{{ train }}"],
    eval_dataset=lm_datasets["{{ validation }}"],
    data_collator=data_collator,
)
train_result = trainer.train()
trainer.save_model()
trainer.log_metrics("train", train_result.metrics)
trainer.save_metrics("train", train_result.metrics)
trainer.save_state()
eval_results = trainer.evaluate()
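# Perplexity is the exponential of the average cross-entropy loss on the validation set.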
eval_results["perplexity"] = math.exp(eval_results['eval_loss'])
print(f"Perplexity: {math.exp(eval_results['eval_loss']):.2f}")
trainer.log_metrics("eval", eval_results)
trainer.save_metrics("eval", eval_results)