botcon committed on
Commit
a8b5bd2
·
1 Parent(s): e5ce4aa

Upload 2 files

Browse files
Files changed (2) hide show
  1. LUKE_pipe.py +122 -0
  2. meta.py +104 -0
LUKE_pipe.py ADDED
@@ -0,0 +1,122 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers import AutoTokenizer, AutoModelForQuestionAnswering
2
+ import numpy as np
3
+ from tqdm import tqdm
4
+ import torch
5
+ import collections
6
+
7
# Decoding / tokenization settings for the LUKE question-answering pipeline.
luke_beam_size = 5  # number of candidate answers returned per example
max_length = 512  # maximum tokenized length of one question+context feature
stride = 128  # token overlap between consecutive windows of a long context
batch_size = 8
# Number of top start/end positions considered per feature. This value was
# previously assigned twice (30, then 20); only the effective value is kept.
n_best = 20
max_answer_length = 30  # longest allowed answer span, in tokens
14
+
15
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Fine-tuned QA checkpoint, moved onto the selected device.
luke_model = AutoModelForQuestionAnswering.from_pretrained(
    "botcon/LUKE_squadshift_finetuned_large"
)
luke_model = luke_model.to(device)

# The pipeline tokenizes with the "roberta-base" tokenizer.
luke_tokenizer = AutoTokenizer.from_pretrained("roberta-base")
18
+
19
def compute_beam(start_logits, end_logits, features, examples):
    """Return the ``luke_beam_size`` best answer spans for every example.

    Args:
        start_logits: Per-feature start scores, indexed as
            ``start_logits[feature_index][token_index]``.
        end_logits: Per-feature end scores, indexed the same way.
        features: Tokenized features; each one provides ``"example_id"`` and
            ``"offset_mapping"`` (entries are ``None`` for tokens that must
            not be part of an answer).
        examples: Original examples providing ``"id"`` and ``"context"``.

    Returns:
        A list with one dict per example holding ``"id"``,
        ``"prediction_text"`` (list of ``luke_beam_size`` strings) and
        ``"logits"`` (list of ``luke_beam_size`` scores). Both lists are
        always present and padded to full length, including when no valid
        span was found, so callers can rely on a fixed shape.
    """
    # Group feature indices by the example they were cut from.
    example_to_features = collections.defaultdict(list)
    for idx, feature in enumerate(features):
        example_to_features[feature["example_id"]].append(idx)

    predicted_answers = []
    for example in tqdm(examples):
        example_id = example["id"]
        context = example["context"]
        candidates = []

        # Collect candidate spans from every feature of this example.
        for feature_index in example_to_features[example_id]:
            start_logit = start_logits[feature_index]
            end_logit = end_logits[feature_index]
            offsets = features[feature_index]["offset_mapping"]

            # Indices of the n_best highest scores, best first.
            start_indexes = np.argsort(start_logit)[-1 : -n_best - 1 : -1].tolist()
            end_indexes = np.argsort(end_logit)[-1 : -n_best - 1 : -1].tolist()
            for start_index in start_indexes:
                # Skip answers that are not fully in the context.
                if offsets[start_index] is None:
                    continue
                for end_index in end_indexes:
                    if offsets[end_index] is None:
                        continue
                    # Skip spans that run backwards or are too long.
                    if (
                        end_index < start_index
                        or end_index - start_index + 1 > max_answer_length
                    ):
                        continue
                    candidates.append(
                        {
                            "text": context[offsets[start_index][0] : offsets[end_index][1]],
                            "logit_score": start_logit[start_index] + end_logit[end_index],
                        }
                    )

        # Keep the highest-scoring spans and pad up to the beam size.
        candidates.sort(key=lambda candidate: candidate["logit_score"], reverse=True)
        top = candidates[:luke_beam_size]
        missing = luke_beam_size - len(top)
        # NOTE(review): the padding score was described as "negative infinity"
        # but 1e-5 is a small positive number and can outrank real candidates
        # with negative scores. The value is kept unchanged because downstream
        # consumers may already depend on it -- confirm before changing.
        best_ans = [candidate["text"] for candidate in top] + [""] * missing
        best_logits = [candidate["logit_score"] for candidate in top] + [1e-5] * missing

        predicted_answers.append(
            {"id": example_id, "prediction_text": best_ans, "logits": best_logits}
        )

    return predicted_answers
76
+
77
def preprocess_validation_examples(examples):
    """Tokenize a batch of question/context pairs into model features.

    The context is the only part that is truncated; overflowing tokens are
    returned as additional features overlapping by ``stride`` tokens. Each
    feature is tagged with the id of the example it came from, and offsets
    of tokens whose sequence id is not 1 are replaced by ``None``.

    Args:
        examples: Batch providing ``"question"``, ``"context"`` and ``"id"``.

    Returns:
        The tokenizer output with a masked ``"offset_mapping"`` and an added
        ``"example_id"`` entry (one id per feature).
    """
    stripped_questions = [question.strip() for question in examples["question"]]
    inputs = luke_tokenizer(
        stripped_questions,
        examples["context"],
        max_length=max_length,
        truncation="only_second",
        stride=stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    # Maps every produced feature back to the index of its source example.
    sample_map = inputs.pop("overflow_to_sample_mapping")
    example_ids = []

    for feature_idx in range(len(inputs["input_ids"])):
        example_ids.append(examples["id"][sample_map[feature_idx]])

        seq_ids = inputs.sequence_ids(feature_idx)
        feature_offsets = inputs["offset_mapping"][feature_idx]
        # Keep offsets only for tokens of the second sequence (the context).
        masked_offsets = []
        for position, pair in enumerate(feature_offsets):
            masked_offsets.append(pair if seq_ids[position] == 1 else None)
        inputs["offset_mapping"][feature_idx] = masked_offsets

    inputs["example_id"] = example_ids
    return inputs
106
+
107
def generate(dataset):
    """Run the LUKE QA model over ``dataset`` and return beam predictions.

    The dataset is tokenized with ``preprocess_validation_examples``, passed
    through ``luke_model`` in chunks of ``batch_size`` features (previously
    every feature went through a single forward pass, which does not scale
    with dataset size), and decoded with ``compute_beam``.

    Args:
        dataset: Dataset providing ``"id"``, ``"question"`` and ``"context"``
            columns and supporting ``map`` / ``column_names``.

    Returns:
        The list produced by ``compute_beam``; an empty list for an empty
        dataset.
    """
    if len(dataset) == 0:
        return []

    luke_model.eval()
    preprocessed = dataset.map(
        preprocess_validation_examples,
        batched=True,
        remove_columns=dataset.column_names,
    )
    # The model only accepts the tensor columns; bookkeeping columns stay in
    # `preprocessed` for decoding.
    eval_set_for_model = preprocessed.remove_columns(["example_id", "offset_mapping"])
    eval_set_for_model.set_format("torch")

    start_chunks = []
    end_chunks = []
    with torch.no_grad():
        for offset in range(0, len(eval_set_for_model), batch_size):
            chunk = eval_set_for_model[offset : offset + batch_size]
            batch = {name: tensor.to(device) for name, tensor in chunk.items()}
            outputs = luke_model(**batch)
            start_chunks.append(outputs.start_logits.cpu().numpy())
            end_chunks.append(outputs.end_logits.cpu().numpy())

    # Features are padded to a fixed length, so chunks stack along axis 0.
    start_logits = np.concatenate(start_chunks, axis=0)
    end_logits = np.concatenate(end_chunks, axis=0)
    return compute_beam(start_logits, end_logits, preprocessed, dataset)
meta.py ADDED
@@ -0,0 +1,104 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch.nn as nn
2
+ import torch
3
+ from transformers import AutoTokenizer, BertForSequenceClassification, PreTrainedModel, PretrainedConfig
4
+ from transformers.modeling_outputs import SequenceClassifierOutput
5
+ from torch.nn import CrossEntropyLoss
6
+ from torch.optim import AdamW
7
+ from LUKE_pipe import generate
8
+ from datasets import load_dataset
9
+ from accelerate import Accelerator
10
+
11
# Label value used for examples whose gold answer is not among the beam
# candidates; it is also passed as `ignore_index` to the loss.
MAX_BEAM = 10

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
14
+
15
class ClassifierAdapter(nn.Module):
    """Score beam candidates from a span score plus BERT classifier logits.

    For each (question, candidate answer) pair the two BERT classification
    logits are concatenated with the candidate's span score, passed through
    a ReLU and reduced to a single score by a linear layer.
    """

    def __init__(self, l1=3):
        """Build the adapter.

        Args:
            l1: Input width of the linear layer. The default of 3 matches
                one span score plus two BERT logits per candidate.
        """
        super().__init__()
        self.linear1 = nn.Linear(l1, 1)

        self.tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
        self.bert = BertForSequenceClassification.from_pretrained("botcon/right_span_bert")
        self.relu = nn.ReLU()

    def forward(self, questions, answers, logits):
        """Return one score per candidate, shaped ``(samples, beam_size)``.

        Args:
            questions: One question string per sample.
            answers: One list of candidate answer strings per sample; every
                list is expected to have the same length (the beam size).
            logits: Span scores laid out like ``answers``.
        """
        beam_size = len(answers[0])
        samples = len(questions)
        # Repeat each question once per candidate, sample-major, so that it
        # lines up with the flattened answers and with the reshape below.
        # (Repeating the whole question list beam_size times would pair
        # questions with candidates of other samples.)
        paired_questions = [
            question for question in questions for _ in range(beam_size)
        ]
        flat_answers = [answer for beam in answers for answer in beam]

        # Follow the device the module's own parameters live on.
        target_device = self.linear1.weight.device
        encoded = self.tokenizer(
            paired_questions,
            flat_answers,
            padding="max_length",
            return_tensors="pt",
        ).to(target_device)
        bert_logits = self.bert(**encoded).logits
        bert_logits = bert_logits.reshape(samples, beam_size, 2)

        span_logits = torch.tensor(
            logits, dtype=torch.float32, device=target_device
        ).unsqueeze(-1)
        combined = torch.cat((span_logits, bert_logits), dim=-1)
        combined = self.relu(combined)
        return torch.squeeze(self.linear1(combined), dim=-1)
42
+
43
class HuggingWrapper(PreTrainedModel):
    """``PreTrainedModel`` wrapper around ``ClassifierAdapter``.

    Wrapping the adapter gives it the ``from_pretrained`` / ``push_to_hub``
    machinery of ``PreTrainedModel``.
    """

    # Must be the configuration *class*, not an instance of it.
    config_class = PretrainedConfig

    def __init__(self, config):
        super().__init__(config)
        self.model = ClassifierAdapter()

    def forward(self, **kwargs):
        """Score the candidates and, when labels are given, compute the loss.

        Args:
            **kwargs: Arguments forwarded to the wrapped adapter, plus an
                optional ``labels`` tensor holding the index of the correct
                candidate per sample (``MAX_BEAM`` marks samples to ignore).

        Returns:
            A ``SequenceClassifierOutput`` with ``logits`` and ``loss``;
            ``loss`` is ``None`` when no labels were passed.
        """
        labels = kwargs.pop("labels", None)
        output = self.model(**kwargs)
        loss = None
        if labels is not None:
            loss_fn = CrossEntropyLoss(ignore_index=MAX_BEAM)
            loss = loss_fn(output, labels)
        return SequenceClassifierOutput(logits=output, loss=loss)
55
+
56
model = HuggingWrapper.from_pretrained("botcon/special_bert").to(device)

# NOTE(review): the accelerator is created but never used by the loop below.
accelerator = Accelerator()
optimizer = AdamW(model.parameters())

num_epoch = 2

raw_datasets = load_dataset("squad")
raw_train = raw_datasets["train"]
batch_size = 2

for epoch in range(num_epoch):
    training_data = raw_train
    model.train()
    for start in range(0, len(training_data), batch_size):
        end = min(start + batch_size, len(training_data))
        optimizer.zero_grad()
        batch_data = raw_train.select(range(start, end))
        # Candidate generation is inference only; no gradients are needed.
        with torch.no_grad():
            res = generate(batch_data)

        # Read the answers column once rather than once per row.
        batch_answers = batch_data["answers"]
        prediction = []
        predicted_logit = []
        labels = []
        for x, answer in zip(res, batch_answers):
            ground_answer = answer["text"][0]
            predicted_text = x["prediction_text"]
            # Label = position of the first candidate equal to the gold
            # answer, or MAX_BEAM (ignored by the loss) when none matches.
            labels.append(
                next(
                    (k for k, text in enumerate(predicted_text) if text == ground_answer),
                    MAX_BEAM,
                )
            )
            prediction.append(predicted_text)
            predicted_logit.append(x["logits"])

        # If every label is ignored, the mean-reduced cross entropy is NaN
        # and stepping on it would corrupt the weights, so skip the batch.
        if all(label == MAX_BEAM for label in labels):
            continue

        labels = torch.LongTensor(labels).to(device)
        classifier_out = model(
            questions=batch_data["question"],
            answers=prediction,
            logits=predicted_logit,
            labels=labels,
        )
        loss = classifier_out.loss
        print(loss.item())
        loss.backward()
        optimizer.step()

model.push_to_hub("some_fake_bert")