Text2Text Generation
Transformers
PyTorch
Safetensors
English
bart
Generated from Trainer
JohnnyBoy00's picture
Upload evaluation.py
1b7c795
raw
history blame
6.03 kB
import numpy as np
import torch
from evaluate import load as load_metric
from sklearn.metrics import accuracy_score, f1_score
from tqdm.auto import tqdm
MAX_TARGET_LENGTH = 128
# load evaluation metrics
sacrebleu = load_metric('sacrebleu')
rouge = load_metric('rouge')
meteor = load_metric('meteor')
bertscore = load_metric('bertscore')
# use gpu if it's available
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
def flatten_list(l):
"""
Utility function to convert a list of lists into a flattened list
Params:
l (list of lists): list to be flattened
Returns:
A flattened list with the elements of the original list
"""
return [item for sublist in l for item in sublist]
def extract_feedback(predictions):
"""
Utility function to extract the feedback from the predictions of the model
Params:
predictions (list): complete model predictions
Returns:
feedback (list): extracted feedback from the model's predictions
"""
feedback = []
# iterate through predictions and try to extract predicted feedback
for pred in predictions:
try:
fb = pred.split(':', 1)[1]
except IndexError:
try:
if pred.lower().startswith('partially correct'):
fb = pred.split(' ', 1)[2]
else:
fb = pred.split(' ', 1)[1]
except IndexError:
fb = pred
feedback.append(fb.strip())
return feedback
def extract_labels(predictions):
"""
Utility function to extract the labels from the predictions of the model
Params:
predictions (list): complete model predictions
Returns:
feedback (list): extracted labels from the model's predictions
"""
labels = []
for pred in predictions:
if pred.lower().startswith('correct'):
label = 'Correct'
elif pred.lower().startswith('partially correct'):
label = 'Partially correct'
elif pred.lower().startswith('incorrect'):
label = 'Incorrect'
else:
label = 'Unknown label'
labels.append(label)
return labels
def compute_metrics(predictions, labels):
"""
Compute evaluation metrics from the predictions of the model
Params:
predictions (list): complete model predictions
labels (list): golden labels (previously tokenized)
Returns:
results (dict): dictionary with the computed evaluation metrics
predictions (list): list of the decoded predictions of the model
"""
# extract feedback and labels from the model's predictions
predicted_feedback = extract_feedback(predictions)
predicted_labels = extract_labels(predictions)
# extract feedback and labels from the golden labels
reference_feedback = [x.split('Feedback:', 1)[1].strip() for x in labels]
reference_labels = [x.split('Feedback:', 1)[0].strip() for x in labels]
# compute HF metrics
sacrebleu_score = sacrebleu.compute(predictions=predicted_feedback, references=[[x] for x in reference_feedback])['score']
rouge_score = rouge.compute(predictions=predicted_feedback, references=reference_feedback)['rouge2']
meteor_score = meteor.compute(predictions=predicted_feedback, references=reference_feedback)['meteor']
bert_score = bertscore.compute(
predictions=predicted_feedback,
references=reference_feedback,
lang='en',
rescale_with_baseline=True)
# use sklearn to compute accuracy and f1 score
reference_labels_np = np.array(reference_labels)
accuracy = accuracy_score(reference_labels_np, predicted_labels)
f1_weighted = f1_score(reference_labels_np, predicted_labels, average='weighted')
f1_macro = f1_score(
reference_labels_np,
predicted_labels,
average='macro',
labels=['Incorrect', 'Partially correct', 'Correct'])
results = {
'sacrebleu': sacrebleu_score,
'rouge': rouge_score,
'meteor': meteor_score,
'bert_score': np.array(bert_score['f1']).mean().item(),
'accuracy': accuracy,
'f1_weighted': f1_weighted,
'f1_macro': f1_macro
}
return results
def evaluate(model, tokenizer, dataloader):
"""
Evaluate model on the given dataset
Params:
model (PreTrainedModel): seq2seq model
tokenizer (PreTrainedTokenizer): tokenizer from HuggingFace
dataloader (torch Dataloader): dataloader of the dataset to be used for evaluation
Returns:
results (dict): dictionary with the computed evaluation metrics
predictions (list): list of the decoded predictions of the model
"""
decoded_preds, decoded_labels = [], []
model.eval()
# iterate through batchs in the dataloader
for batch in tqdm(dataloader):
with torch.no_grad():
batch = {k: v.to(device) for k, v in batch.items()}
# generate tokens from batch
generated_tokens = model.generate(
batch['input_ids'],
attention_mask=batch['attention_mask'],
max_length=MAX_TARGET_LENGTH
)
# get golden labels from batch
labels_batch = batch['labels']
# decode model predictions and golden labels
decoded_preds_batch = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
decoded_labels_batch = tokenizer.batch_decode(labels_batch, skip_special_tokens=True)
decoded_preds.append(decoded_preds_batch)
decoded_labels.append(decoded_labels_batch)
# convert predictions and golden labels into flattened lists
predictions = flatten_list(decoded_preds)
labels = flatten_list(decoded_labels)
# compute metrics based on predictions and golden labels
results = compute_metrics(predictions, labels)
return results, predictions