from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorWithPadding
from torch.utils.data import DataLoader
from transformers import AutoModelForSequenceClassification
from torch.optim import AdamW
from transformers import get_scheduler
import torch
from tqdm.auto import tqdm
import evaluate

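# Load the MRPC paraphrase-detection task from the GLUE benchmark
# and the tokenizer matching the chosen checkpoint.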
raw_datasets = load_dataset("glue", "mrpc")
checkpoint = 'bert-base-cased'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

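# Tokenize sentence pairs; padding is left to the data collator so each
# batch is padded only to its own longest sequence.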
def tokenize_function(example):
    return tokenizer(example['sentence1'], example['sentence2'], truncation=True)

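# Drop the raw text columns and rename 'label' to 'labels', the argument
# name the model's forward pass expects.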
tokenized_dataset = raw_datasets.map(tokenize_function, batched=True)
tokenized_dataset = tokenized_dataset.remove_columns(['sentence1', 'sentence2', 'idx'])
tokenized_dataset = tokenized_dataset.rename_column('label', 'labels')
#print(tokenized_dataset.column_names["train"])

tokenized_dataset.set_format('torch')
#print(tokenized_dataset)

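# Dynamic padding: pad each batch to the length of its longest example.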
data_collator = DataCollatorWithPadding(tokenizer)

train_dataloader = DataLoader(
    tokenized_dataset['train'], shuffle=True, batch_size=8, collate_fn=data_collator
)

eval_dataloader = DataLoader(
    tokenized_dataset['validation'], batch_size=8, collate_fn=data_collator
)

#for batch in train_dataloader:
#    break
#print({k: v.shape for k, v in batch.items()})
#print()
#print(batch)
#print()

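# BERT with a freshly initialized 2-label classification head
# (paraphrase / not paraphrase).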
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

#outputs = model(**batch)
#print(outputs.loss, outputs.logits.shape)

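# AdamW optimizer with a typical fine-tuning learning rate.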
optimizer = AdamW(model.parameters(), lr=5e-5)

#loss = outputs.loss
#loss.backward()
#optimizer.step()

#optimizer.zero_grad()

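# Linear schedule: decay the learning rate from its initial value to 0
# over the full number of training steps, with no warmup.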
num_epochs = 3
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    'linear',
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps
)

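# Use Apple Silicon's MPS backend when available, otherwise fall back to CPU.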
device = torch.device('mps') if torch.backends.mps.is_available() else torch.device('cpu')
model.to(device)
print(f'Using device: {device}')

progress_bar = tqdm(range(num_training_steps))

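# Training loop: forward pass, backpropagation, optimizer and scheduler steps.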
model.train()
for epoch in range(num_epochs):
    for batch in train_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

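# Evaluation: collect predictions batch by batch, then compute the
# GLUE MRPC metric (accuracy and F1).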
metric = evaluate.load('glue', 'mrpc')
model.eval()
for batch in eval_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        outputs = model(**batch)

    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    metric.add_batch(predictions=predictions, references=batch['labels'])

result = metric.compute()
print(result)

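# Save the fine-tuned model and tokenizer for later reuse.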
save_dir = "/Users/alexandr/Desktop/HUGGING_FACE/model"

model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)

print(f"model and tokenizer saved to {save_dir}")