"""Compare two lighteval detail parquet files (original vs. strict evaluation
run) and print a per-example, side-by-side report to stdout."""

import pandas as pd
import sys
import re
from pprint import pprint

# NOTE(review): sys, re and pprint appear unused in this chunk — kept in case
# another part of the file relies on them.

# Paths to the parquet detail files for each evaluation run.
PARQUET_FILE_ORIGINAL = "data/lighteval_results/details/Qwen/Qwen2.5-72B-Instruct/2025-04-01T13-30-41.628107/details_custom|yourbench|0_2025-04-01T13-30-41.628107.parquet"
PARQUET_FILE_STRICT = "data/lighteval_results_strict_20250401_134031/details/Qwen/Qwen2.5-72B-Instruct/2025-04-01T13-41-03.260648/details_custom|yourbench|0_2025-04-01T13-41-03.260648.parquet"


def oui_non(flag):
    """Return the French yes/no label ('Oui'/'Non') used in the report."""
    return 'Oui' if flag else 'Non'


def first_or_na(seq):
    """Return the first element of *seq*, or 'N/A' when it is empty."""
    return seq[0] if len(seq) > 0 else "N/A"


def accuracy_scores(df):
    """Extract the per-example 'accuracy' values from the 'metrics' column.

    Missing accuracy entries are reported as 'N/A'.
    """
    return [metric.get('accuracy', 'N/A') for metric in df['metrics']]


def compare_example(row_orig, row_strict, index):
    """Print the comparison report for one example.

    Args:
        row_orig: row (``df.iloc[i]``) from the original-run details frame.
        row_strict: row at the same position from the strict-run frame.
        index: zero-based example index (displayed as 1-based).
    """
    print(f"EXEMPLE {index + 1}:")

    # Question (taken from the original run; both runs share the dataset).
    question_orig = row_orig.specifics.get('question', 'N/A')
    print(f"Question: {question_orig}")

    # Evaluation scores from each run.
    score_orig = row_orig.metrics.get('accuracy', 'N/A')
    score_strict = row_strict.metrics.get('accuracy', 'N/A')
    print(f"Score original: {score_orig}")
    print(f"Score strict: {score_strict}")

    # Model answers and gold references (first entry of each list, if any).
    model_answer_orig = first_or_na(row_orig.predictions)
    model_answer_strict = first_or_na(row_strict.predictions)
    reference_orig = first_or_na(row_orig.choices)
    reference_strict = first_or_na(row_strict.choices)

    # Whether the two runs produced identical answers / references.
    responses_identical = model_answer_orig == model_answer_strict
    references_identical = reference_orig == reference_strict
    print(f"Réponses du modèle identiques: {oui_non(responses_identical)}")
    print(f"Références identiques: {oui_non(references_identical)}")

    # When the score changed, show the likely cause (stricter system prompt).
    if score_orig != score_strict:
        print("\nRaison possible du changement de score:")
        print(" Critères d'évaluation plus stricts dans le prompt système")
        print(" Rejet des réponses contenant des nuances (however, but, although, etc.)")

    print("-"*80 + "\n")


def main():
    """Load both parquet detail files and print the full comparison report.

    On any failure (missing file, unexpected schema, ...) prints the error
    and, when available, the column lists of the frames loaded so far to
    help diagnose schema mismatches.
    """
    try:
        print("Chargement des données...")
        df_original = pd.read_parquet(PARQUET_FILE_ORIGINAL)
        df_strict = pd.read_parquet(PARQUET_FILE_STRICT)

        # Basic summary of both runs.
        print(f"Nombre d'exemples originaux: {len(df_original)}")
        print(f"Nombre d'exemples stricts: {len(df_strict)}")
        print(f"Scores originaux: {accuracy_scores(df_original)}")
        print(f"Scores stricts: {accuracy_scores(df_strict)}")

        print("\n" + "="*80 + "\n")
        print("COMPARAISON DES RÉSULTATS")
        print("="*80 + "\n")

        # Compare examples pairwise up to the shorter frame's length;
        # hoist the positional row lookups out of the per-field accesses.
        for i in range(min(len(df_original), len(df_strict))):
            compare_example(df_original.iloc[i], df_strict.iloc[i], i)

    except Exception as e:
        # Top-level boundary of a diagnostic script: report and dump schema.
        print(f"Erreur: {e}")
        if "df_original" in locals():
            print("\nColonnes dans df_original:", df_original.columns.tolist())
        if "df_strict" in locals():
            print("\nColonnes dans df_strict:", df_strict.columns.tolist())


if __name__ == "__main__":
    main()