Update QA_Tensorflow.py

Implements question_answering_tf: tokenizes the (question, context) pair with a sliding window, masks everything outside the context, and returns the answer span whose (start, end) token pair has the highest joint probability.

QA_Tensorflow.py CHANGED (+51 −11)

@@ -1,19 +1,59 @@
 from transformers import AutoTokenizer, TFAutoModelForQuestionAnswering
-
 import tensorflow as tf
+import numpy as np
 
 checkpoint = "distilbert-base-cased-distilled-squad"
 tokenizer = AutoTokenizer.from_pretrained(checkpoint)
 model = TFAutoModelForQuestionAnswering.from_pretrained(checkpoint)
 
 def question_answering_tf(question, context):
-
-
-
-
-
-
-
-
-
-
+    # Tokenize with a sliding window: long contexts are split into
+    # overlapping chunks (stride 50) instead of being truncated.
+    inputs = tokenizer(question,
+                       context,
+                       max_length=384,
+                       stride=50,
+                       truncation="only_second",
+                       padding=True,
+                       return_overflowing_tokens=True,
+                       return_offsets_mapping=True,
+                       return_tensors="tf")
+
+    # The model does not accept these extra keys; keep the offset mapping
+    # so token indices can be mapped back to character positions later.
+    _ = inputs.pop("overflow_to_sample_mapping")
+    offset_mapping = inputs.pop("offset_mapping")
+
+    outputs = model(inputs)
+
+    start_logits = outputs.start_logits
+    end_logits = outputs.end_logits
+
+    # Masking: hide the question tokens (sequence id != 1) and padding so the
+    # answer can only come from the context; keep [CLS] (index 0) unmasked.
+    sequence_ids = inputs.sequence_ids()
+    mask = [i != 1 for i in sequence_ids]
+    mask[0] = False
+    mask = tf.math.logical_or(tf.constant(mask)[None], inputs["attention_mask"] == 0)
+
+    start_logits = tf.where(mask, -10000, start_logits)
+    end_logits = tf.where(mask, -10000, end_logits)
+
+    # Softmax over each chunk's tokens
+    start_probabilities = tf.nn.softmax(start_logits, axis=-1).numpy()
+    end_probabilities = tf.nn.softmax(end_logits, axis=-1).numpy()
+
+    # Find the (start token, end token) pair with the best probability score.
+    # np.triu keeps only pairs with start <= end, and the winning chunk index
+    # is remembered so the matching offset mapping is used below.
+    max_score = 0.0
+    start_index, end_index, best_chunk = 0, 0, 0
+    for i, (sp, ep) in enumerate(zip(start_probabilities, end_probabilities)):
+        scores = np.matmul(sp[:, np.newaxis], ep[np.newaxis, :])
+        index = np.triu(scores).argmax().item()
+        row, col = np.unravel_index(index, scores.shape)
+        score = scores[row, col]
+        if score > max_score:
+            max_score = score
+            start_index, end_index, best_chunk = row, col, i
+
+    # Map the winning tokens back to character positions in the original
+    # context; offsets are half-open, so the end character needs no +1.
+    start = int(offset_mapping[best_chunk][start_index][0])
+    end = int(offset_mapping[best_chunk][end_index][1])
+    return context[start:end]
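For a quick sanity check after the change, a minimal usage sketch might look like the following (the question and context strings are illustrative examples, not part of the commit):

context = ("The Amazon rainforest covers most of the Amazon basin of South "
           "America, an area of roughly 5.5 million square kilometres.")
question = "How large an area does the Amazon rainforest cover?"

print(question_answering_tf(question, context))
# The function returns a span copied verbatim from `context`,
# e.g. "roughly 5.5 million square kilometres"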