# demo/backend/examine_strict_results.py
import pandas as pd

# Paths to the lighteval detail parquet files (original vs. strict run)
parquet_file_original = "data/lighteval_results/details/Qwen/Qwen2.5-72B-Instruct/2025-04-01T13-30-41.628107/details_custom|yourbench|0_2025-04-01T13-30-41.628107.parquet"
parquet_file_strict = "data/lighteval_results_strict_20250401_134031/details/Qwen/Qwen2.5-72B-Instruct/2025-04-01T13-41-03.260648/details_custom|yourbench|0_2025-04-01T13-41-03.260648.parquet"
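# Note: these paths are hardcoded to two specific evaluation runs; point them at
# your own lighteval details output directories if you rerun the comparison.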

try:
    # Load the parquet files
    print("Loading data...")
    df_original = pd.read_parquet(parquet_file_original)
    df_strict = pd.read_parquet(parquet_file_strict)

    # Print basic information
    print(f"Number of original examples: {len(df_original)}")
    print(f"Number of strict examples: {len(df_strict)}")
    print(f"Original scores: {[metric.get('accuracy', 'N/A') for metric in df_original['metrics']]}")
    print(f"Strict scores: {[metric.get('accuracy', 'N/A') for metric in df_strict['metrics']]}")

    print("\n" + "=" * 80 + "\n")
    print("RESULT COMPARISON")
    print("=" * 80 + "\n")
    # Compare the results example by example
    for i in range(min(len(df_original), len(df_strict))):
        print(f"EXAMPLE {i + 1}:")

        # Question
        question_orig = df_original.iloc[i].specifics.get("question", "N/A")
        question_strict = df_strict.iloc[i].specifics.get("question", "N/A")
        print(f"Question: {question_orig}")

        # Scores
        score_orig = df_original.iloc[i].metrics.get("accuracy", "N/A")
        score_strict = df_strict.iloc[i].metrics.get("accuracy", "N/A")
        print(f"Original score: {score_orig}")
        print(f"Strict score: {score_strict}")

        # Model answers (first prediction, if any)
        model_answer_orig = df_original.iloc[i].predictions[0] if len(df_original.iloc[i].predictions) > 0 else "N/A"
        model_answer_strict = df_strict.iloc[i].predictions[0] if len(df_strict.iloc[i].predictions) > 0 else "N/A"

        # Gold references (first choice, if any)
        reference_orig = df_original.iloc[i].choices[0] if len(df_original.iloc[i].choices) > 0 else "N/A"
        reference_strict = df_strict.iloc[i].choices[0] if len(df_strict.iloc[i].choices) > 0 else "N/A"

        # Check whether answers and references are identical between the two runs
        responses_identical = model_answer_orig == model_answer_strict
        references_identical = reference_orig == reference_strict
        print(f"Model answers identical: {'Yes' if responses_identical else 'No'}")
        print(f"References identical: {'Yes' if references_identical else 'No'}")

        # Point out what likely changed the score
        if score_orig != score_strict:
            print("\nPossible reason for the score change:")
            print("  Stricter evaluation criteria in the system prompt")
            print("  Rejection of answers containing hedges (however, but, although, etc.)")

        print("-" * 80 + "\n")
except Exception as e:
    print(f"Error: {e}")
    # Help debugging by listing the available columns if the dataframes were loaded
    if "df_original" in locals():
        print("\nColumns in df_original:", df_original.columns.tolist())
    if "df_strict" in locals():
        print("\nColumns in df_strict:", df_strict.columns.tolist())