|
from transformers import AutoTokenizer, TFAutoModelForQuestionAnswering |
|
import tensorflow as tf |
|
import numpy as np |
|
|
|
# Pretrained extractive-QA checkpoint: DistilBERT (cased) fine-tuned on SQuAD.
checkpoint = "distilbert-base-cased-distilled-squad"

# Fast tokenizer is required below: the QA function relies on
# `return_offsets_mapping=True` and `sequence_ids()`, both fast-only features.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# TensorFlow model with a span-classification head (start/end logits).
# NOTE: downloads weights on first run — network I/O at import time.
model = TFAutoModelForQuestionAnswering.from_pretrained(checkpoint)
|
|
|
def question_answering_tf(question, context):
    """Extract the answer span for `question` from `context` (extractive QA).

    Long contexts are split into overlapping features (max_length=384,
    stride=50); the single best-scoring (start, end) span across all
    features is mapped back to character offsets and sliced out of
    `context`.

    Args:
        question: The question string.
        context: The context string to search for the answer.

    Returns:
        The substring of `context` predicted as the answer. May be empty
        when the model's best span is the [CLS] "no answer" position.
    """
    inputs = tokenizer(
        question,
        context,
        max_length=384,
        stride=50,
        truncation="only_second",  # only the context is truncated/split
        padding=True,
        return_overflowing_tokens=True,  # one feature per context window
        return_offsets_mapping=True,  # token -> character-span mapping
        return_tensors="tf",
    )

    # The model's forward pass does not accept these extra keys; keep the
    # offsets aside so token indices can be mapped back to `context` chars.
    _ = inputs.pop("overflow_to_sample_mapping")
    offset_mapping = inputs.pop("offset_mapping")

    outputs = model(inputs)
    start_logits = outputs.start_logits
    end_logits = outputs.end_logits

    # Mask everything that is not a context token (question tokens, special
    # tokens, padding) — except position 0 ([CLS]), which the model may use
    # to signal "no answer". sequence_ids() of the first feature is valid
    # for all features here because the question prefix is identical.
    sequence_ids = inputs.sequence_ids()
    mask = [i != 1 for i in sequence_ids]
    mask[0] = False  # keep [CLS]
    mask = tf.math.logical_or(tf.constant(mask)[None], inputs["attention_mask"] == 0)

    # A large negative logit drives the masked positions' softmax prob to ~0.
    start_logits = tf.where(mask, -10000, start_logits)
    end_logits = tf.where(mask, -10000, end_logits)

    start_probabilities = tf.nn.softmax(start_logits, axis=-1).numpy()
    end_probabilities = tf.nn.softmax(end_logits, axis=-1).numpy()

    # Track the best (start <= end) span across all overflow features.
    max_score = 0.0
    start_token = 0
    end_token = 0
    offset_index = 0

    for i, (sp, ep) in enumerate(zip(start_probabilities, end_probabilities)):
        # Outer product of start/end probabilities; the upper triangle keeps
        # only pairs with start_token <= end_token.
        scores = sp[:, np.newaxis] * ep[np.newaxis, :]
        index = np.triu(scores).argmax().item()
        row, col = divmod(index, scores.shape[1])
        score = scores[row][col]
        if score > max_score:
            max_score = score
            start_token = row
            end_token = col
            offset_index = i

    # Offset-mapping entries are (start_char, end_char) with an EXCLUSIVE
    # end, so slice with end_char directly — the previous `end_char + 1`
    # was an off-by-one that appended one extra character to the answer.
    start_char, _ = offset_mapping[offset_index][start_token]
    _, end_char = offset_mapping[offset_index][end_token]
    return context[int(start_char):int(end_char)]