File size: 4,810 Bytes
ae81e0f |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 |
"""
Helper functions dataset setup and loading
"""
import os
from os.path import join
import shutil
import numpy as np
from torch.utils.data import Dataset, DataLoader
from datasets import Dataset as HFDataset
from huggingface_hub import hf_hub_download
from transformers import AutoTokenizer, LlamaTokenizer
from transformers import DataCollatorForSeq2Seq
# from transformers import DefaultDataCollator, DataCollatorWithPadding
def get_seq2seq_loader(dataset: Dataset, tokenizer: AutoTokenizer,
                       split: str, **loader_kwargs: any):
    """
    Build a DataLoader for seq2seq-style evaluation.

    Forces right-side padding and uses a seq2seq collator that pads
    labels with -100 so loss computation ignores padded positions.
    Shuffling is enabled only when the split name contains 'train'.
    """
    tokenizer.padding_side = 'right'
    data_collator = DataCollatorForSeq2Seq(
        tokenizer, label_pad_token_id=-100, return_tensors='pt')
    shuffle = 'train' in split
    return DataLoader(dataset,
                      shuffle=shuffle,
                      collate_fn=data_collator,
                      **loader_kwargs)
def get_lm_loader(dataset: Dataset, tokenizer: AutoTokenizer,
                  split: str, max_length: int = None, **loader_kwargs: any):
    """
    Build a DataLoader for language-model training.

    Currently identical in behavior to ``get_seq2seq_loader``: the
    seq2seq collator pads inputs and replaces label padding with -100
    so the loss ignores padded positions.

    NOTE: ``max_length`` is accepted for interface compatibility but is
    not used by this collator (it was only needed by the padding
    collator this function previously experimented with).
    """
    collate_fn = DataCollatorForSeq2Seq(
        tokenizer, label_pad_token_id=-100, return_tensors='pt')
    # Shuffle only for training splits
    return DataLoader(
        dataset, shuffle='train' in split, collate_fn=collate_fn, **loader_kwargs)
def convert_to_hf_dataset(dataset, cache_dir: str):
    """
    Convert an iterable dataset into a HuggingFace ``datasets.Dataset``.

    Wraps the input in a generator and materializes it via
    ``Dataset.from_generator``, caching the result under ``cache_dir``.
    """
    def _sample_generator():
        for sample in dataset:
            yield sample

    return HFDataset.from_generator(_sample_generator, cache_dir=cache_dir)
def get_tokenizer_from_config(model_config):
    """
    Load the pretrained tokenizer described by a model config dict.

    ``model_config`` must contain 'pretrained_model_name_or_path' and,
    for the local-Llama path, 'cache_dir'; the full dict is forwarded
    to ``from_pretrained``.

    Raises whatever ``from_pretrained`` raised if no loading strategy
    succeeds.
    """
    name_or_path = model_config['pretrained_model_name_or_path']
    if 'llama' in name_or_path:
        try:  # if we store the tokenizer locally
            model_path = join(model_config['cache_dir'], name_or_path)
            tokenizer = LlamaTokenizer.from_pretrained(model_path)
        except Exception as e:
            try:
                tokenizer = AutoTokenizer.from_pretrained(**model_config)
                print("-> Bad LlamaTokenizer.from_pretrained(model_path)", e)
                print("-> But resolved with: AutoTokenizer.from_pretrained(**model_config)")
            except Exception as e2:
                print("-> Error with AutoTokenizer.from_pretrained(**model_config)", e2)
                # Bug fix: previously this branch only printed, leaving
                # `tokenizer` unbound and crashing with a confusing
                # UnboundLocalError at `return tokenizer` below.
                raise e2 from e
            # tokenizer = LlamaTokenizer.from_pretrained(**model_config) # v4.43 errors with `*** TypeError: not a string`
    elif 'Mistral-7B-Instruct-v0.3' in name_or_path:
        tokenizer = LlamaTokenizer.from_pretrained(**model_config)  # hack where AutoTokenizer doesn't recognize
    else:
        # Covers 'Mistral-7B' and everything else; the two original
        # branches were byte-identical, so they are merged here.
        tokenizer = AutoTokenizer.from_pretrained(**model_config)
    return tokenizer
def add_special_tokens_to_dataset(dataset, tokenizer):
    """
    Attach the tokenizer's special tokens (and their ids) to a dataset.

    For each entry in ``tokenizer.special_tokens_map`` this sets two
    attributes on ``dataset``: the token string (e.g. ``eos_token``)
    and its id (e.g. ``eos_token_id``). Returns the same dataset object.

    NOTE(review): ids are paired positionally with
    ``tokenizer.all_special_ids``, assuming it is ordered like
    ``special_tokens_map`` — confirm for the tokenizers in use.
    """
    attributes = dict(tokenizer.special_tokens_map)
    for position, name in enumerate(tokenizer.special_tokens_map):
        attributes[f'{name}_id'] = tokenizer.all_special_ids[position]
    for attr_name, attr_value in attributes.items():
        setattr(dataset, attr_name, attr_value)
    return dataset
def train_test_split(samples: any, train_size: int, test_size: int, seed: int):
    """
    Deterministically split ``samples`` into train and test subsets.

    Seeds NumPy's global RNG, draws ``test_size`` indices without
    replacement, and assigns every remaining index to the train set.
    Raises AssertionError (after printing the sizes) when
    ``len(samples) != train_size + test_size``.
    """
    expected_total = train_size + test_size
    try:
        assert len(samples) == expected_total
    except Exception as e:
        # Surface the mismatched sizes before propagating the failure
        print(len(samples), expected_total)
        raise e
    indices = np.arange(len(samples))
    np.random.seed(seed)
    test_indices = np.random.choice(indices, size=test_size, replace=False)
    train_indices = np.setdiff1d(indices, test_indices)
    return samples[train_indices], samples[test_indices]
def download_scrolls_metric():
    """
    Download the SCROLLS metric script (ROUGE, F1, etc.) from the HF Hub.

    Copies the downloaded ``metrics/scrolls.py`` to a sibling file whose
    basename contains no extra dots (``scrolls_py.py``), and returns the
    path of the copy.
    """
    scrolls_metric_path = hf_hub_download(
        repo_id="tau/scrolls", filename="metrics/scrolls.py", repo_type="dataset"
    )
    # Bug fix: the original concatenated dirname + basename without a
    # path separator, writing e.g. ``...metricsscrolls_py.py`` into the
    # parent directory. Join the components properly instead.
    updated_scrolls_metric_path = join(
        os.path.dirname(scrolls_metric_path),
        os.path.basename(scrolls_metric_path).replace(".", "_") + ".py",
    )
    shutil.copy(scrolls_metric_path, updated_scrolls_metric_path)
    return updated_scrolls_metric_path
|