"""Evaluate the quality of the generated text using various metrics"""
import os
import json
import argparse
import pandas as pd
from dotenv import load_dotenv
from .models import LengthEvaluator, MTLDEvaluator, RewardEvaluator, TextPair, UniEvaluator
from .utils import logger, set_logger
sys_path = os.path.abspath(os.path.dirname(__file__))
set_logger(os.path.join(sys_path, "cache", "logs", "evaluate.log"))
load_dotenv()
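
# Note on the input format (inferred from the loader in the __main__ block below,
# not from separate documentation): each JSON file under --folder is expected to map
# an id to a record with "question" and "answer" fields, e.g.
#   {"0": {"question": "...", "answer": "..."}, "1": {"question": "...", "answer": "..."}}
# Every record is wrapped in a TextPair before scoring.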


def evaluate_length(corpus, tokenizer_name):
    length_evaluator = LengthEvaluator(
        tokenizer_name=tokenizer_name
    )
    logger.info("Length evaluator loaded")
    scores = length_evaluator.get_average_score(corpus)
    logger.info("Length scores: %s", scores)
    return scores


def evaluate_mtld(corpus):
    mtld_evaluator = MTLDEvaluator()
    logger.info("MTLD evaluator loaded")
    scores = mtld_evaluator.get_average_score(corpus)
    logger.info("MTLD scores: %s", scores)
    min_max_scores = mtld_evaluator.get_min_max_score(corpus)
    logger.info("MTLD min max scores: %s", min_max_scores)
    return scores, min_max_scores


def evaluate_reward(corpus, reward_model_names):
    scores = []
    for reward_name in reward_model_names:
        reward_evaluator = RewardEvaluator(
            reward_name=reward_name
        )
        logger.info("Loaded reward model: %s", reward_name)
        average_score = reward_evaluator.get_average_score(corpus)
        logger.info("%s scores: %s", reward_name, average_score)
        min_max_scores = reward_evaluator.get_min_max_score(corpus)
        logger.info("%s min max scores: %s", reward_name, min_max_scores)
        scores.append({
            'reward_name': reward_name.split('/')[-1],
            'score': average_score,
            'min_max_scores': min_max_scores
        })
        # Release the current reward model and free GPU memory before loading the next one.
        del reward_evaluator
        clean_gpu_cache()
    return scores


def evaluate_uni(corpus, uni_model_name):
    uni_evaluator = UniEvaluator(
        model_name=uni_model_name
    )
    logger.info("Uni evaluator loaded with model %s", uni_model_name)
    uni_scores = uni_evaluator.get_average_score(corpus)
    for key, value in uni_scores.items():
        logger.info("Uni %s scores: %s", key, value)
    min_max_scores = uni_evaluator.get_min_max_score(corpus)
    for key, value in min_max_scores.items():
        logger.info("Uni %s min max scores: %s", key, value)
    del uni_evaluator
    clean_gpu_cache()
    return (uni_scores['naturalness'], uni_scores['coherence'], uni_scores['understandability'],
            min_max_scores['naturalness'], min_max_scores['coherence'], min_max_scores['understandability'])


def clean_gpu_cache():
    import torch
    if torch.cuda.is_available():
        torch.cuda.empty_cache()


if __name__ == '__main__':
    import torch.multiprocessing as mp

    parser = argparse.ArgumentParser()
    parser.add_argument('--folder', type=str, default='cache/data', help='folder to load data')
    parser.add_argument('--output', type=str, default='cache/output', help='path to save output')
    parser.add_argument('--tokenizer', type=str, default='cl100k_base', help='tokenizer name')
    parser.add_argument('--reward', type=str, default='OpenAssistant/reward-model-deberta-v3-large-v2',
                        help='Comma-separated list of reward models')
    parser.add_argument('--uni', type=str, default='MingZhong/unieval-sum', help='uni model name')
    args = parser.parse_args()

    if not os.path.exists(args.folder):
        raise ValueError(f"Folder {args.folder} does not exist")
    if not os.path.exists(args.output):
        os.makedirs(args.output)

    reward_models = args.reward.split(',')
    results = []
    logger.info("Loading data from %s", args.folder)
    mp.set_start_method('spawn')

    for file in os.listdir(args.folder):
        if file.endswith('.json'):
            logger.info("Processing %s", file)
            with open(os.path.join(args.folder, file), 'r', encoding='utf-8') as f:
                data = json.load(f)
            # Wrap each record (keyed by id, with 'question' and 'answer' fields) in a TextPair.
            data = [TextPair(
                question=data[key]['question'],
                answer=data[key]['answer']
            ) for key in data]

            length_scores = evaluate_length(data, args.tokenizer)
            mtld_scores, min_max_mtld_scores = evaluate_mtld(data)
            reward_scores = evaluate_reward(data, reward_models)
            uni_naturalness_scores, uni_coherence_scores, uni_understandability_scores, \
                min_max_uni_naturalness_scores, min_max_uni_coherence_scores, min_max_uni_understandability_scores \
                = evaluate_uni(data, args.uni)

            result = {
                'file': file,
                'number': len(data),
                'length': length_scores,
                'mtld': mtld_scores,
                'mtld_min_max': min_max_mtld_scores,
                'uni_naturalness': uni_naturalness_scores,
                'uni_coherence': uni_coherence_scores,
                'uni_understandability': uni_understandability_scores,
                'uni_naturalness_min_max': min_max_uni_naturalness_scores,
                'uni_coherence_min_max': min_max_uni_coherence_scores,
                'uni_understandability_min_max': min_max_uni_understandability_scores
            }
            for reward_score in reward_scores:
                result[reward_score['reward_name']] = reward_score['score']
                result[f"{reward_score['reward_name']}_min_max"] = reward_score['min_max_scores']
            results.append(result)

    results = pd.DataFrame(results)
    results.to_csv(os.path.join(args.output, 'evaluation.csv'), index=False)
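
# Example invocation (a sketch; because the script uses relative imports, it has to be run
# as a module, and the package path shown here is a placeholder for the actual package name):
#   python -m <package>.evaluate \
#       --folder cache/data \
#       --output cache/output \
#       --tokenizer cl100k_base \
#       --reward OpenAssistant/reward-model-deberta-v3-large-v2 \
#       --uni MingZhong/unieval-sum
# Results are written to <output>/evaluation.csv, one row per input JSON file, with columns
# for length, MTLD, UniEval (naturalness/coherence/understandability), their min-max variants,
# and one column pair per reward model.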