Upload 2 files
Browse files- LUKE_pipe.py +122 -0
- meta.py +104 -0
LUKE_pipe.py
ADDED
@@ -0,0 +1,122 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
|
2 |
+
import numpy as np
|
3 |
+
from tqdm import tqdm
|
4 |
+
import torch
|
5 |
+
import collections
|
6 |
+
|
7 |
+
# Number of candidate answers kept per example by compute_beam.
luke_beam_size = 5
# Tokenizer window length and the overlap (in tokens) between consecutive
# windows of a long context.
max_length = 512
stride = 128
batch_size = 8
# How many of the highest start/end logits are combined per feature.
# This file used to assign n_best twice (30, then 20); only the later value
# ever took effect, so the dead first assignment was dropped and the
# effective value is kept.
n_best = 20
# Longest answer span, counted in tokens, that compute_beam accepts.
max_answer_length = 30

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
luke_model = AutoModelForQuestionAnswering.from_pretrained("botcon/LUKE_squadshift_finetuned_large").to(device)
# NOTE(review): the tokenizer comes from "roberta-base", not from the model
# checkpoint — presumably deliberate; confirm the vocabularies match.
luke_tokenizer = AutoTokenizer.from_pretrained("roberta-base")
|
18 |
+
|
19 |
+
def compute_beam(start_logits, end_logits, features, examples):
    """Build a fixed-size beam of answer candidates for every example.

    For each example, every feature (tokenized window) that belongs to it is
    scanned: the ``n_best`` highest start logits are combined with the
    ``n_best`` highest end logits, spans that fall outside the context or
    violate the length limits are dropped, and the survivors are ranked by
    ``start_logit + end_logit``.

    Args:
        start_logits: Indexable by feature index; each entry holds the start
            logits of one feature.
        end_logits: Same layout as ``start_logits``, for end positions.
        features: Indexable/iterable of feature mappings that provide
            ``"example_id"`` and ``"offset_mapping"`` (entries are ``None``
            for tokens that are not part of the context).
        examples: Iterable of mappings that provide ``"id"`` and ``"context"``.

    Returns:
        A list with one dict per example, in input order, with keys ``"id"``,
        ``"prediction_text"`` (list of ``luke_beam_size`` strings) and
        ``"logits"`` (list of ``luke_beam_size`` scores). When fewer than
        ``luke_beam_size`` candidates exist — including none at all — the
        lists are padded with ``""`` and ``1e-5``. Previously the
        no-candidate case returned a bare ``""`` without a ``"logits"`` key,
        which did not match the shape of every other entry.
    """
    example_to_features = collections.defaultdict(list)
    for idx, feature in enumerate(features):
        example_to_features[feature["example_id"]].append(idx)

    predicted_answers = []
    for example in tqdm(examples):
        example_id = example["id"]
        context = example["context"]
        answers = []

        # Loop through all features associated with that example
        for feature_index in example_to_features[example_id]:
            start_logit = start_logits[feature_index]
            end_logit = end_logits[feature_index]
            offsets = features[feature_index]["offset_mapping"]

            start_indexes = np.argsort(start_logit)[-1 : -n_best - 1 : -1].tolist()
            end_indexes = np.argsort(end_logit)[-1 : -n_best - 1 : -1].tolist()
            for start_index in start_indexes:
                # A start outside the context rules out every end index.
                if offsets[start_index] is None:
                    continue
                for end_index in end_indexes:
                    # Skip answers that are not fully in the context
                    if offsets[end_index] is None:
                        continue
                    # Skip answers with a length that is either < 0 or > max_answer_length
                    if (
                        end_index < start_index
                        or end_index - start_index + 1 > max_answer_length
                    ):
                        continue

                    answers.append(
                        {
                            "text": context[offsets[start_index][0] : offsets[end_index][1]],
                            "logit_score": start_logit[start_index] + end_logit[end_index],
                        }
                    )

        # Keep the best-scoring candidates and pad up to the beam size so
        # that every example yields lists of identical length.
        answers.sort(key=lambda candidate: candidate["logit_score"], reverse=True)
        top = answers[:luke_beam_size]
        missing = luke_beam_size - len(top)

        best_ans = [candidate["text"] for candidate in top] + [""] * missing
        # NOTE(review): the pad score was originally commented as "treat this
        # as negative infinity", but 1e-5 is a small positive number and can
        # outrank real candidates whose summed logit is negative. The value
        # is kept as-is to avoid changing downstream inputs; confirm intent.
        best_logits = [candidate["logit_score"] for candidate in top] + [1e-5] * missing

        predicted_answers.append({"id": example_id, "prediction_text": best_ans, "logits": best_logits})

    return predicted_answers
|
76 |
+
|
77 |
+
def preprocess_validation_examples(examples):
    """Tokenize a batch of question/context pairs into model features.

    Questions are stripped of surrounding whitespace and paired with their
    contexts. Only the context (second sequence) is truncated; overflowing
    tokens are returned as extra features that overlap by ``stride`` tokens,
    and every feature is padded to ``max_length``.

    Args:
        examples: Batch mapping with ``"question"``, ``"context"`` and
            ``"id"`` columns.

    Returns:
        The tokenizer output with ``"overflow_to_sample_mapping"`` removed,
        an added ``"example_id"`` entry naming the source example of each
        feature, and ``"offset_mapping"`` entries set to ``None`` for every
        token whose sequence id is not 1.
    """
    tokenizer_options = {
        "max_length": max_length,
        "truncation": "only_second",
        "stride": stride,
        "return_overflowing_tokens": True,
        "return_offsets_mapping": True,
        "padding": "max_length",
    }
    stripped_questions = [text.strip() for text in examples["question"]]
    inputs = luke_tokenizer(stripped_questions, examples["context"], **tokenizer_options)

    # One entry per produced feature: the index of the example it came from.
    feature_to_sample = inputs.pop("overflow_to_sample_mapping")
    source_ids = []

    feature_count = len(inputs["input_ids"])
    for feature_idx in range(feature_count):
        source_ids.append(examples["id"][feature_to_sample[feature_idx]])

        # Blank out the offsets of everything that is not a context token.
        seq_ids = inputs.sequence_ids(feature_idx)
        raw_offsets = inputs["offset_mapping"][feature_idx]
        masked_offsets = []
        for position, span in enumerate(raw_offsets):
            if seq_ids[position] == 1:
                masked_offsets.append(span)
            else:
                masked_offsets.append(None)
        inputs["offset_mapping"][feature_idx] = masked_offsets

    inputs["example_id"] = source_ids
    return inputs
|
106 |
+
|
107 |
+
def generate(dataset):
    """Run the LUKE reader over ``dataset`` and return beam candidates.

    The dataset is tokenized with ``preprocess_validation_examples``, the
    resulting features are fed to ``luke_model`` in chunks of ``batch_size``
    rows with gradients disabled, and the collected start/end logits are
    handed to ``compute_beam``.

    Feeding the features in chunks (instead of one forward pass over the
    whole dataset, as before) keeps peak memory bounded by ``batch_size``
    rather than by the dataset size.

    Args:
        dataset: Dataset with ``"id"``, ``"question"`` and ``"context"``
            columns, supporting ``len``, ``map`` and ``column_names``.

    Returns:
        The list produced by ``compute_beam`` (one entry per example), or an
        empty list when ``dataset`` has no rows.
    """
    luke_model.eval()
    if len(dataset) == 0:
        return []

    preprocessed = dataset.map(
        preprocess_validation_examples,
        batched=True,
        remove_columns=dataset.column_names,
    )
    # The model only accepts tensor columns; ids and offsets are kept in
    # `preprocessed` for the post-processing step.
    eval_set_for_model = preprocessed.remove_columns(["example_id", "offset_mapping"])
    eval_set_for_model.set_format("torch")

    start_chunks = []
    end_chunks = []
    with torch.no_grad():
        for first_row in range(0, len(eval_set_for_model), batch_size):
            rows = eval_set_for_model[first_row : first_row + batch_size]
            batch = {name: column.to(device) for name, column in rows.items()}
            outputs = luke_model(**batch)
            start_chunks.append(outputs.start_logits.cpu().numpy())
            end_chunks.append(outputs.end_logits.cpu().numpy())

    # Stitch the per-chunk logits back together in feature order.
    start_logits = np.concatenate(start_chunks)
    end_logits = np.concatenate(end_chunks)
    return compute_beam(start_logits, end_logits, preprocessed, dataset)
|
meta.py
ADDED
@@ -0,0 +1,104 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import torch.nn as nn
|
2 |
+
import torch
|
3 |
+
from transformers import AutoTokenizer, BertForSequenceClassification, PreTrainedModel, PretrainedConfig
|
4 |
+
from transformers.modeling_outputs import SequenceClassifierOutput
|
5 |
+
from torch.nn import CrossEntropyLoss
|
6 |
+
from torch.optim import AdamW
|
7 |
+
from LUKE_pipe import generate
|
8 |
+
from datasets import load_dataset
|
9 |
+
from accelerate import Accelerator
|
10 |
+
|
11 |
+
# Label value meaning "the gold answer is not among the candidates"; the
# loss in this file is built with ignore_index=MAX_BEAM.
MAX_BEAM = 10

# Use the GPU when torch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
14 |
+
|
15 |
+
class ClassifierAdapter(nn.Module):
    """Score each beam candidate from its reader logit and a BERT classifier.

    Every (question, candidate answer) pair is run through a BERT sequence
    classifier that yields two logits. Those are concatenated with the
    candidate's reader score, passed through ReLU, and reduced to a single
    score per candidate by a linear layer.
    """

    def __init__(self, l1=3):
        """Create the adapter.

        Args:
            l1: Input width of the final linear layer. The default of 3
                matches what ``forward`` builds: one reader score plus the
                two classifier logits.
        """
        super().__init__()
        self.linear1 = nn.Linear(l1, 1)

        self.tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
        self.bert = BertForSequenceClassification.from_pretrained("botcon/right_span_bert")
        self.relu = nn.ReLU()

    @staticmethod
    def _pair_questions_with_beams(questions, answers):
        """Flatten beams sample-major and repeat each question to match.

        Returns ``(flat_questions, flat_answers)`` where position
        ``s * beam_size + b`` holds question ``s`` and candidate ``b`` of
        sample ``s``. The beam size is taken from ``answers[0]``.
        """
        beam_size = len(answers[0])
        flat_questions = [question for question in questions for _ in range(beam_size)]
        flat_answers = [answer for beam in answers for answer in beam]
        return flat_questions, flat_answers

    def forward(self, questions, answers, logits):
        """Return one score per candidate.

        Args:
            questions: Sequence of question strings, one per sample.
            answers: Sequence of beams; each beam is a sequence of candidate
                answer strings. All beams are expected to have the length of
                the first one.
            logits: Nested sequence of reader scores with the same layout as
                ``answers``.

        Returns:
            Tensor of shape ``(len(questions), beam_size)``.
        """
        beam_size = len(answers[0])
        samples = len(questions)
        # Questions must be repeated per candidate (q0, q0, ..., q1, q1, ...)
        # so they line up with the sample-major flattening of the answers and
        # with the reshape below. Tiling them (q0, q1, q0, q1, ...) pairs
        # candidates with the wrong question whenever samples > 1.
        flat_questions, flat_answers = self._pair_questions_with_beams(questions, answers)
        encoded = self.tokenizer(
            flat_questions,
            flat_answers,
            padding="max_length",
            return_tensors="pt",
        ).to(device)
        bert_logits = self.bert(**encoded).logits
        bert_logits = bert_logits.reshape(samples, beam_size, 2)
        # (samples, beam) -> (samples, beam, 1) so it can be concatenated
        # with the two classifier logits along the last axis.
        logits = torch.FloatTensor(logits).to(device).unsqueeze(-1)
        logits = torch.cat((logits, bert_logits), dim=-1)
        logits = self.relu(logits)
        out = torch.squeeze(self.linear1(logits), dim=-1)
        return out
|
42 |
+
|
43 |
+
class HuggingWrapper(PreTrainedModel):
    """``PreTrainedModel`` shell around ``ClassifierAdapter``.

    Wrapping the adapter gives it the ``from_pretrained`` / ``push_to_hub``
    interface used elsewhere in this file.
    """

    # Must be the configuration *class*, not an instance of it.
    config_class = PretrainedConfig

    def __init__(self, config):
        super().__init__(config)
        self.model = ClassifierAdapter()

    def forward(self, **kwargs):
        """Score candidates and, when labels are supplied, compute the loss.

        Args:
            **kwargs: Forwarded to ``ClassifierAdapter.forward`` after the
                optional ``labels`` entry has been removed. ``labels`` holds
                the index of the correct candidate per sample; entries equal
                to ``MAX_BEAM`` are ignored by the loss.

        Returns:
            ``SequenceClassifierOutput`` whose ``logits`` are the adapter
            scores and whose ``loss`` is the cross-entropy over candidates,
            or ``None`` when no labels were passed.
        """
        # Labels are optional so the model can also be called for inference.
        labels = kwargs.pop("labels", None)
        output = self.model(**kwargs)
        loss = None
        if labels is not None:
            loss_fn = CrossEntropyLoss(ignore_index=MAX_BEAM)
            loss = loss_fn(output, labels)
        return SequenceClassifierOutput(logits=output, loss=loss)
|
55 |
+
|
56 |
+
# Training script: fine-tune the re-ranking model on SQuAD, using the
# candidates produced by generate() as inputs and the position of the gold
# answer inside each beam as the classification target.
model = HuggingWrapper.from_pretrained("botcon/special_bert").to(device)

# NOTE(review): the accelerator is created but not otherwise used in the
# code visible here.
accelerator = Accelerator()
optimizer = AdamW(model.parameters())

num_epoch = 2

raw_datasets = load_dataset("squad")
raw_train = raw_datasets["train"]
batch_size = 2

for epoch in range(num_epoch):
    training_data = raw_train
    model.train()
    for start in range(0, len(training_data), batch_size):
        end = min(start + batch_size, len(training_data))
        batch_data = raw_train.select(range(start, end))
        # The reader is frozen; it only supplies candidates and scores.
        with torch.no_grad():
            res = generate(batch_data)

        prediction = []
        predicted_logit = []
        labels = []
        for x, gold in zip(res, batch_data["answers"]):
            ground_answer = gold["text"][0]
            predicted_text = x["prediction_text"]
            # Target = position of the first exact match in the beam, or
            # MAX_BEAM (the loss's ignore index) when there is none.
            label = next(
                (k for k, text in enumerate(predicted_text) if text == ground_answer),
                MAX_BEAM,
            )
            labels.append(label)
            prediction.append(predicted_text)
            predicted_logit.append(x["logits"])

        # With mean reduction, a batch in which every target is ignored
        # yields a NaN loss; back-propagating it would poison the weights,
        # so such batches are skipped.
        if all(label == MAX_BEAM for label in labels):
            continue

        optimizer.zero_grad()
        labels = torch.LongTensor(labels).to(device)
        classifier_out = model(
            questions=batch_data["question"],
            answers=prediction,
            logits=predicted_logit,
            labels=labels,
        )
        loss = classifier_out.loss
        print(loss.item())
        loss.backward()
        optimizer.step()

model.push_to_hub("some_fake_bert")
|