nalinaksh committed on
Commit
dccc40c
·
1 Parent(s): 79bb388

Update QA_Tensorflow.py

Browse files
Files changed (1) hide show
  1. QA_Tensorflow.py +51 -11
QA_Tensorflow.py CHANGED
@@ -1,19 +1,59 @@
1
  from transformers import AutoTokenizer, TFAutoModelForQuestionAnswering
2
-
3
  import tensorflow as tf
 
4
 
5
  # Checkpoint identifier handed to both from_pretrained calls below, so the
  # tokenizer and the model are always loaded from the same checkpoint.
  checkpoint = "distilbert-base-cased-distilled-squad"
6
  # Created once at module level and reused by question_answering_tf.
  tokenizer = AutoTokenizer.from_pretrained(checkpoint)
7
  model = TFAutoModelForQuestionAnswering.from_pretrained(checkpoint)
8
 
9
  def question_answering_tf(question, context):
10
- inputs = tokenizer(question, context, return_tensors="tf")
11
- #print(inputs["input_ids"])
12
- #print(tokenizer.decode(inputs["input_ids"][0]))
13
- outputs = model(inputs)
14
- #print(outputs.start_logits)
15
- #print(outputs.end_logits)
16
- start_index = int(tf.math.argmax(outputs.start_logits, axis=-1)[0])
17
- end_index = int(tf.math.argmax(outputs.end_logits, axis=-1)[0])
18
- print(start_index, end_index)
19
- return tokenizer.decode(inputs["input_ids"][0][start_index: end_index+1])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  from transformers import AutoTokenizer, TFAutoModelForQuestionAnswering
 
2
  import tensorflow as tf
3
+ import numpy as np
4
 
5
  # Checkpoint identifier handed to both from_pretrained calls below, so the
  # tokenizer and the model are always loaded from the same checkpoint.
  checkpoint = "distilbert-base-cased-distilled-squad"
6
  # Created once at module level and reused by question_answering_tf.
  tokenizer = AutoTokenizer.from_pretrained(checkpoint)
7
  model = TFAutoModelForQuestionAnswering.from_pretrained(checkpoint)
8
 
9
  def question_answering_tf(question, context):
10
+ inputs = tokenizer(question,
11
+ context,
12
+ max_length=384,
13
+ stride=50,
14
+ truncation='only_second',
15
+ padding=True,
16
+ return_overflowing_tokens=True,
17
+ return_offsets_mapping=True,
18
+ return_tensors="tf")
19
+
20
+ _ = inputs.pop("overflow_to_sample_mapping")
21
+ offset_mapping = inputs.pop("offset_mapping")
22
+
23
+ outputs = model(inputs)
24
+
25
+ start_logits = outputs.start_logits
26
+ end_logits = outputs.end_logits
27
+
28
+ #Masking
29
+ sequence_ids = inputs.sequence_ids()
30
+ mask = [i != 1 for i in sequence_ids]
31
+ mask[0] = False
32
+ mask = tf.math.logical_or(tf.constant(mask)[None], inputs["attention_mask"] == 0)
33
+
34
+ start_logits = tf.where(mask, -10000, start_logits)
35
+ end_logits = tf.where(mask, -10000, end_logits)
36
+
37
+ #Softmax
38
+ start_probabilities = tf.nn.softmax(start_logits, axis=-1).numpy()
39
+ end_probabilities = tf.nn.softmax(end_logits, axis=-1).numpy()
40
+
41
+ #Finding (start token, end token) pair with best probability score
42
+ max_score = 0.0
43
+ start_index,end_index = 0,0
44
+ for i, probs in enumerate(zip(start_probabilities, end_probabilities)):
45
+ sp, ep = probs
46
+ scores = tf.matmul(sp[:,np.newaxis], ep[np.newaxis,:])
47
+ index = np.triu(scores).argmax().item()
48
+ row = int(index/384)
49
+ col = index % 384
50
+ score = scores[row][col]
51
+ if(score > max_score):
52
+ max_score = score
53
+ start_index = row
54
+ end_index = col
55
+
56
+ #Return characters from context corresponding to start and end of token characters
57
+ start = int(offset_mapping[i][start_index][0])
58
+ end = int(offset_mapping[i][end_index][1])
59
+ return context[start:end+1]