import re
import os

# Path to the judge log file
log_file = "lighteval_judge.log"
# Extract the judge's evaluations from the log
def extract_judge_evaluations(log_content):
    # Each judge response runs from "Judge response:" up to the next one (or end of file)
    pattern = r"Judge response: (.*?)(?=Judge response:|$)"
    responses = re.findall(pattern, log_content, re.DOTALL)

    # Parse each response to extract the final decision
    evaluations = []
    for i, response in enumerate(responses):
        # Look for the final decision inside the XML tags
        final_answer_match = re.search(r"<final_answer>(.*?)</final_answer>", response, re.DOTALL)
        if final_answer_match:
            final_answer = final_answer_match.group(1).strip()
        # No XML tag: fall back to keyword matching
        elif re.search(r"\b(correct|vrai|true|yes|1)\b", response, re.IGNORECASE):
            final_answer = "1 (inferred without XML tag)"
        elif re.search(r"\b(incorrect|faux|false|no|0)\b", response, re.IGNORECASE):
            final_answer = "0 (inferred without XML tag)"
        else:
            final_answer = "Not detected"
        evaluations.append({
            "id": i + 1,
            "final_answer": final_answer,
            "full_response": response[:500] + "..." if len(response) > 500 else response,
        })
    return evaluations
# Extract the prompts that were sent to the judge
def extract_judge_prompts(log_content):
    # Each prompt runs from "Prompt sent to judge:" up to the next prompt or judge response
    pattern = r"Prompt sent to judge: (.*?)(?=Prompt sent to judge:|Judge response:|$)"
    prompts = re.findall(pattern, log_content, re.DOTALL)

    # Pull the question, model answer and reference (gold) answer out of each prompt
    analyzed_prompts = []
    for i, prompt in enumerate(prompts):
        question_match = re.search(r"<question>(.*?)</question>", prompt, re.DOTALL)
        model_answer_match = re.search(r"<model_answer>(.*?)</model_answer>", prompt, re.DOTALL)
        gold_answer_match = re.search(r"<gold_answer>(.*?)</gold_answer>", prompt, re.DOTALL)

        question = question_match.group(1).strip() if question_match else "Not detected"
        model_answer = model_answer_match.group(1).strip() if model_answer_match else "Not detected"
        gold_answer = gold_answer_match.group(1).strip() if gold_answer_match else "Not detected"

        analyzed_prompts.append({
            "id": i + 1,
            "question": question,
            "model_answer": model_answer[:200] + "..." if len(model_answer) > 200 else model_answer,
            "gold_answer": gold_answer[:200] + "..." if len(gold_answer) > 200 else gold_answer,
        })
    return analyzed_prompts
# Read the log file and report on the judge's decisions
if os.path.exists(log_file):
    with open(log_file, "r", encoding="utf-8") as f:
        log_content = f.read()

    # Extract the evaluations and the prompts
    evaluations = extract_judge_evaluations(log_content)
    prompts = extract_judge_prompts(log_content)

    # Print a summary of the evaluations
    print(f"Total number of evaluations: {len(evaluations)}")
    print("\nSummary of decisions:")
    decisions = {}
    for evaluation in evaluations:
        decision = evaluation["final_answer"]
        decisions[decision] = decisions.get(decision, 0) + 1
    for decision, count in decisions.items():
        print(f"  {decision}: {count} times ({count / len(evaluations) * 100:.1f}%)")

    # Print the details of each evaluation
    print("\n" + "=" * 80)
    print("QUESTION / MODEL ANSWER / REFERENCE / DECISION DETAILS")
    print("=" * 80 + "\n")
    for i in range(min(len(prompts), len(evaluations))):
        prompt = prompts[i]
        evaluation = evaluations[i]
        print(f"EXAMPLE {i + 1}:")
        print(f"Question: {prompt['question']}")
        print(f"\nModel answer: {prompt['model_answer']}")
        print(f"\nReference answer: {prompt['gold_answer']}")
        print(f"\nJudge decision: {evaluation['final_answer']}")
        print("\nExcerpt from the judge's full response:")
        full_response = evaluation["full_response"]
        print(full_response[:300] + "..." if len(full_response) > 300 else full_response)
        print("\n" + "-" * 80 + "\n")
else:
    print(f"Log file {log_file} not found.")