Spaces:
Runtime error
Runtime error
# Before running, install required packages: | |
{% if notebook %} | |
! | |
{%- else %} | |
# | |
{%- endif %} | |
pip install datasets transformers | |
import collections | |
import math | |
import logging | |
import numpy as np | |
import transformers | |
import datasets | |
from datasets import load_dataset | |
from transformers import (AutoConfig, AutoModelForMaskedLM, AutoTokenizer, | |
DataCollatorForLanguageModeling, Trainer, | |
TrainingArguments, default_data_collator, set_seed) | |
from transformers.testing_utils import CaptureLogger | |
from transformers.utils.versions import require_version | |
{{ header("Setup") }} | |
logger = logging.getLogger(__name__) | |
require_version("datasets>=1.8.0") | |
set_seed({{ seed }}) | |
logging.basicConfig( | |
format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", | |
datefmt="%m/%d/%Y %H:%M:%S", | |
level=logging.ERROR, | |
) | |
datasets.utils.logging.set_verbosity_warning() | |
transformers.utils.logging.set_verbosity_info() | |
{{ header("Load model and dataset") }} | |
{% if subset == 'default' %} | |
datasets = load_dataset('{{dataset}}') | |
{% else %} | |
datasets = load_dataset('{{dataset}}', '{{ subset }}') | |
{% endif %} | |
model_checkpoint = "{{model_checkpoint}}" | |
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True) | |
{% if pretrained %} | |
model = AutoModelFor{{task}}.from_pretrained(model_checkpoint) | |
{% else %} | |
config = AutoConfig.from_pretrained(model_checkpoint) | |
model = AutoModelFor{{task}}.from_config(config) | |
{% endif %} | |
model.resize_token_embeddings(len(tokenizer)) | |
model_name = model_checkpoint.split("/")[-1] | |
if tokenizer.pad_token is None: | |
tokenizer.pad_token = tokenizer.eos_token | |
{{ header("Preprocessing") }} | |
# since this will be pickled to avoid _LazyModule error in Hasher force logger loading before tokenize_function | |
tok_logger = transformers.utils.logging.get_logger("transformers.tokenization_utils_base") | |
def tokenize_function(examples): | |
with CaptureLogger(tok_logger) as cl: | |
result = tokenizer(examples["{{ feature }}"]) | |
if "Token indices sequence length is longer than the" in cl.out: | |
tok_logger.warning( | |
"^^^^^^^^^^^^^^^^ Please ignore the warning above - this long input will be chunked into smaller bits" | |
" before being passed to the model." | |
) | |
if tokenizer.is_fast: | |
result["word_ids"] = [result.word_ids(i) for i in range(len(result["input_ids"]))] | |
return result | |
tokenized_datasets = datasets.map(tokenize_function, batched=True, num_proc=4, remove_columns=list(set(sum(list(datasets.column_names.values()),[]))), desc="Running tokenizer on dataset" | |
) | |
block_size = {{ block_size }} | |
def group_texts(examples): | |
# Concatenate all texts. | |
concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()} | |
total_length = len(concatenated_examples[list(examples.keys())[0]]) | |
# We drop the small remainder, we could add padding if the model supported it instead of this drop, you can | |
# customize this part to your needs. | |
total_length = (total_length // block_size) * block_size | |
# Split by chunks of max_len. | |
result = { | |
k: [t[i : i + block_size] for i in range(0, total_length, block_size)] | |
for k, t in concatenated_examples.items() | |
} | |
result["labels"] = result["input_ids"].copy() | |
return result | |
lm_datasets = tokenized_datasets.map( | |
group_texts, | |
batched=True, | |
batch_size=1000, | |
num_proc=4, | |
desc=f"Grouping texts in chunks of {block_size}", | |
) | |
{{ header("Training") }} | |
training_args = TrainingArguments( | |
output_dir=f"{model_name}-finetuned", | |
per_device_train_batch_size={{ batch_size }}, | |
per_device_eval_batch_size={{ batch_size }}, | |
evaluation_strategy='epoch', | |
logging_strategy='epoch', | |
save_strategy='epoch', | |
optim='{{ optimizer }}', | |
learning_rate={{ lr }}, | |
num_train_epochs={{ num_epochs }}, | |
gradient_accumulation_steps={{ gradient_accumulation_steps }}, | |
lr_scheduler_type='{{ lr_scheduler_type }}', | |
warmup_steps={{ num_warmup_steps }}, | |
{% if use_weight_decay%} | |
weight_decay={{ weight_decay }}, | |
{% endif %} | |
push_to_hub=False, | |
dataloader_num_workers=0, | |
{% if task=="MaskedLM" %} | |
{% if whole_word_masking %} | |
remove_unused_columns=False, | |
{% endif %} | |
{% endif %} | |
load_best_model_at_end=True, | |
log_level='error' | |
) | |
{% if whole_word_masking %} | |
def whole_word_masking_data_collator(features): | |
for feature in features: | |
word_ids = feature.pop("word_ids") | |
# Create a map between words and corresponding token indices | |
mapping = collections.defaultdict(list) | |
current_word_index = -1 | |
current_word = None | |
for idx, word_id in enumerate(word_ids): | |
if word_id is not None: | |
if word_id != current_word: | |
current_word = word_id | |
current_word_index += 1 | |
mapping[current_word_index].append(idx) | |
# Randomly mask words | |
wwm_probability = {{ mlm_probability }} | |
mask = np.random.binomial(1, wwm_probability, (len(mapping),)) | |
input_ids = feature["input_ids"] | |
labels = feature["labels"] | |
new_labels = [-100] * len(labels) | |
for word_id in np.where(mask)[0]: | |
word_id = word_id.item() | |
for idx in mapping[word_id]: | |
new_labels[idx] = labels[idx] | |
input_ids[idx] = tokenizer.mask_token_id | |
return default_data_collator(features) | |
data_collator = whole_word_masking_data_collator | |
{% else %} | |
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability={{ mlm_probability }}) | |
{% endif %} | |
trainer = Trainer( | |
model=model, | |
args=training_args, | |
train_dataset=lm_datasets["{{ train }}"], | |
eval_dataset=lm_datasets["{{ validation }}"], | |
data_collator=data_collator, | |
) | |
train_result = trainer.train() | |
trainer.save_model() | |
trainer.log_metrics("train", train_result.metrics) | |
trainer.save_metrics("train", train_result.metrics) | |
trainer.save_state() | |
eval_results = trainer.evaluate() | |
eval_results["perplexity"] = math.exp(eval_results['eval_loss']) | |
print(f"Perplexity: {math.exp(eval_results['eval_loss']):.2f}") | |
trainer.log_metrics("eval", eval_results) | |
trainer.save_metrics("eval", eval_results) |