Spaces:

mshukor
/

eP-ALM-Audio-Text

Build error

eP-ALM-Audio-Text / refTools /evaluation /rouge /rouge.py

mshukor

init

33f1db4 almost 2 years ago

3.64 kB

	#!/usr/bin/env python
	#
	# File Name : rouge.py
	#
	# Description : Computes ROUGE-L metric as described by Lin and Hovey (2004)
	#
	# Creation Date : 2015-01-07 06:03
	# Author : Ramakrishna Vedantam <[email protected]>

	import numpy as np
	import pdb

	def my_lcs(string, sub):
	"""
	Calculates longest common subsequence for a pair of tokenized strings
	:param string : list of str : tokens from a string split using whitespace
	:param sub : list of str : shorter string, also split using whitespace
	:returns: length (list of int): length of the longest common subsequence between the two strings

	Note: my_lcs only gives length of the longest common subsequence, not the actual LCS
	"""
	if(len(string)< len(sub)):
	sub, string = string, sub

	lengths = [[0 for i in range(0,len(sub)+1)] for j in range(0,len(string)+1)]

	for j in range(1,len(sub)+1):
	for i in range(1,len(string)+1):
	if(string[i-1] == sub[j-1]):
	lengths[i][j] = lengths[i-1][j-1] + 1
	else:
	lengths[i][j] = max(lengths[i-1][j] , lengths[i][j-1])

	return lengths[len(string)][len(sub)]

	class Rouge():
	'''
	Class for computing ROUGE-L score for a set of candidate sentences for the MS COCO test set

	'''
	def __init__(self):
	# vrama91: updated the value below based on discussion with Hovey
	self.beta = 1.2

	def calc_score(self, candidate, refs):
	"""
	Compute ROUGE-L score given one candidate and references for an image
	:param candidate: str : candidate sentence to be evaluated
	:param refs: list of str : COCO reference sentences for the particular image to be evaluated
	:returns score: int (ROUGE-L score for the candidate evaluated against references)
	"""
	assert(len(candidate)==1)
	assert(len(refs)>0)
	prec = []
	rec = []

	# split into tokens
	token_c = candidate[0].split(" ")

	for reference in refs:
	# split into tokens
	token_r = reference.split(" ")
	# compute the longest common subsequence
	lcs = my_lcs(token_r, token_c)
	prec.append(lcs/float(len(token_c)))
	rec.append(lcs/float(len(token_r)))

	prec_max = max(prec)
	rec_max = max(rec)

	if(prec_max!=0 and rec_max !=0):
	score = ((1 + self.beta*2)prec_maxrec_max)/float(rec_max + self.beta2prec_max)
	else:
	score = 0.0
	return score

	def compute_score(self, gts, res):
	"""
	Computes Rouge-L score given a set of reference and candidate sentences for the dataset
	Invoked by evaluate_captions.py
	:param hypo_for_image: dict : candidate / test sentences with "image name" key and "tokenized sentences" as values
	:param ref_for_image: dict : reference MS-COCO sentences with "image name" key and "tokenized sentences" as values
	:returns: average_score: float (mean ROUGE-L score computed by averaging scores for all the images)
	"""
	assert(gts.keys() == res.keys())
	imgIds = gts.keys()

	score = []
	for id in imgIds:
	hypo = res[id]
	ref = gts[id]

	score.append(self.calc_score(hypo, ref))

	# Sanity check.
	assert(type(hypo) is list)
	assert(len(hypo) == 1)
	assert(type(ref) is list)
	assert(len(ref) > 0)

	average_score = np.mean(np.array(score))
	return average_score, np.array(score)

	def method(self):
	return "Rouge"