Spaces:
Build error
Build error
#!/usr/bin/env python | |
# | |
# File Name : rouge.py | |
# | |
# Description : Computes ROUGE-L metric as described by Lin and Hovey (2004) | |
# | |
# Creation Date : 2015-01-07 06:03 | |
# Author : Ramakrishna Vedantam <[email protected]> | |
import numpy as np | |
import pdb | |
def my_lcs(string, sub): | |
""" | |
Calculates longest common subsequence for a pair of tokenized strings | |
:param string : list of str : tokens from a string split using whitespace | |
:param sub : list of str : shorter string, also split using whitespace | |
:returns: length (list of int): length of the longest common subsequence between the two strings | |
Note: my_lcs only gives length of the longest common subsequence, not the actual LCS | |
""" | |
if(len(string)< len(sub)): | |
sub, string = string, sub | |
lengths = [[0 for i in range(0,len(sub)+1)] for j in range(0,len(string)+1)] | |
for j in range(1,len(sub)+1): | |
for i in range(1,len(string)+1): | |
if(string[i-1] == sub[j-1]): | |
lengths[i][j] = lengths[i-1][j-1] + 1 | |
else: | |
lengths[i][j] = max(lengths[i-1][j] , lengths[i][j-1]) | |
return lengths[len(string)][len(sub)] | |
class Rouge(): | |
''' | |
Class for computing ROUGE-L score for a set of candidate sentences for the MS COCO test set | |
''' | |
def __init__(self): | |
# vrama91: updated the value below based on discussion with Hovey | |
self.beta = 1.2 | |
def calc_score(self, candidate, refs): | |
""" | |
Compute ROUGE-L score given one candidate and references for an image | |
:param candidate: str : candidate sentence to be evaluated | |
:param refs: list of str : COCO reference sentences for the particular image to be evaluated | |
:returns score: int (ROUGE-L score for the candidate evaluated against references) | |
""" | |
assert(len(candidate)==1) | |
assert(len(refs)>0) | |
prec = [] | |
rec = [] | |
# split into tokens | |
token_c = candidate[0].split(" ") | |
for reference in refs: | |
# split into tokens | |
token_r = reference.split(" ") | |
# compute the longest common subsequence | |
lcs = my_lcs(token_r, token_c) | |
prec.append(lcs/float(len(token_c))) | |
rec.append(lcs/float(len(token_r))) | |
prec_max = max(prec) | |
rec_max = max(rec) | |
if(prec_max!=0 and rec_max !=0): | |
score = ((1 + self.beta**2)*prec_max*rec_max)/float(rec_max + self.beta**2*prec_max) | |
else: | |
score = 0.0 | |
return score | |
def compute_score(self, gts, res): | |
""" | |
Computes Rouge-L score given a set of reference and candidate sentences for the dataset | |
Invoked by evaluate_captions.py | |
:param hypo_for_image: dict : candidate / test sentences with "image name" key and "tokenized sentences" as values | |
:param ref_for_image: dict : reference MS-COCO sentences with "image name" key and "tokenized sentences" as values | |
:returns: average_score: float (mean ROUGE-L score computed by averaging scores for all the images) | |
""" | |
assert(gts.keys() == res.keys()) | |
imgIds = gts.keys() | |
score = [] | |
for id in imgIds: | |
hypo = res[id] | |
ref = gts[id] | |
score.append(self.calc_score(hypo, ref)) | |
# Sanity check. | |
assert(type(hypo) is list) | |
assert(len(hypo) == 1) | |
assert(type(ref) is list) | |
assert(len(ref) > 0) | |
average_score = np.mean(np.array(score)) | |
return average_score, np.array(score) | |
def method(self): | |
return "Rouge" | |