SondosMB committed
Commit e4f66e8 · verified · 1 parent: 24a059f

Update app.py

Files changed (1): app.py (+57 -9)
app.py CHANGED
```diff
@@ -45,27 +45,75 @@ def evaluate_predictions(prediction_file):
     try:
         predictions_df = pd.read_csv(prediction_file.name)
         ground_truth_df = pd.read_csv(ground_truth_file)
-        filename = os.path.basename(prediction_file.name)
-        model_name = filename.split('_')[1].split('.')[0] if "_" in filename else "unknown_model"
+
+        # Extract model name
+        try:
+            filename = os.path.basename(prediction_file.name)
+            if "_" in filename and "." in filename:
+                model_name = filename.split('_')[1].split('.')[0]
+            else:
+                model_name = "unknown_model"
+        except IndexError:
+            model_name = "unknown_model"
 
-        merged_df = pd.merge(predictions_df, ground_truth_df, on='question_id', how='inner')
+        # Merge dataframes
+        merged_df = pd.merge(
+            predictions_df,
+            ground_truth_df,
+            on='question_id',
+            how='inner'
+        )
         merged_df['pred_answer'] = merged_df['predicted_answer'].apply(clean_answer)
-        correct_predictions = (merged_df['pred_answer'] == merged_df['Answer']).sum()
+        invalid_predictions = merged_df['pred_answer'].isna().sum()
+        valid_predictions = merged_df.dropna(subset=['pred_answer'])
+        correct_predictions = (valid_predictions['pred_answer'] == valid_predictions['Answer']).sum()
         total_predictions = len(merged_df)
-        overall_accuracy = correct_predictions / total_predictions
+        total_valid_predictions = len(valid_predictions)
+
+        # Ensure no division by zero
+        overall_accuracy = correct_predictions / total_predictions if total_predictions > 0 else 0
+        valid_accuracy = (
+            correct_predictions / total_valid_predictions
+            if total_valid_predictions > 0
+            else 0
+        )
+
+        field_metrics = {}
+        for field in merged_df['Field'].unique():
+            field_data = merged_df[merged_df['Field'] == field]
+            field_valid_data = field_data.dropna(subset=['pred_answer'])
+
+            field_correct = (field_valid_data['pred_answer'] == field_valid_data['Answer']).sum()
+            field_total = len(field_data)
+            field_valid_total = len(field_valid_data)
+            field_invalid = field_total - field_valid_total
+
+            field_metrics[field] = {
+                'accuracy': field_correct / field_total if field_total > 0 else 0,
+                'valid_accuracy': field_correct / field_valid_total if field_valid_total > 0 else 0,
+                'correct': field_correct,
+                'total': field_total,
+                'invalid': field_invalid
+            }
 
         results = {
             'model_name': model_name,
             'overall_accuracy': overall_accuracy,
-            'correct_predictions': correct_predictions,
+            'valid_accuracy': valid_accuracy,
             'total_questions': total_predictions,
+            'valid_predictions': total_valid_predictions,
+            'invalid_predictions': invalid_predictions,
+            'correct_predictions': correct_predictions,
+            'field_performance': field_metrics
         }
 
-        update_leaderboard(results)
-
-        return "Evaluation completed successfully! Leaderboard updated.", LEADERBOARD_FILE
+        output_file = "evaluation_results.txt"
+        write_evaluation_results(results, output_file)
+        return "Evaluation completed successfully!", output_file
+
     except Exception as e:
-        return f"Error: {str(e)}", None
+        return f"Error during evaluation: {str(e)}", None
+
 
 # Gradio Interface with Leaderboard
 def display_leaderboard():
```
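
The split between `overall_accuracy` and `valid_accuracy` is the core of this change: predictions that `clean_answer` cannot parse (NaN after cleaning) count as wrong in the former and are excluded from the latter. A toy example with hypothetical data illustrates the difference; column names mirror the diff:

```python
import pandas as pd

# Hypothetical merged_df; 'pred_answer' holds cleaned predictions, 'Answer' the ground truth.
merged_df = pd.DataFrame({
    'pred_answer': ['A', 'B', None, 'D'],  # one unparseable prediction
    'Answer':      ['A', 'C', 'B',  'D'],
})

invalid_predictions = merged_df['pred_answer'].isna().sum()   # 1
valid_predictions = merged_df.dropna(subset=['pred_answer'])
correct = (valid_predictions['pred_answer'] == valid_predictions['Answer']).sum()  # 2

overall_accuracy = correct / len(merged_df)        # 2/4 = 0.50 (invalid counts as wrong)
valid_accuracy = correct / len(valid_predictions)  # 2/3 ≈ 0.67 (invalid rows excluded)
```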
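The hunk also replaces the leaderboard update with a downloadable report written by `write_evaluation_results`, which the new code calls but which is not part of this hunk. A minimal sketch of what such a helper could look like, assuming the `results` dict built above; the report layout is illustrative, not the repository's actual implementation:

```python
def write_evaluation_results(results, output_file):
    """Sketch: dump overall and per-field metrics to a plain-text report."""
    with open(output_file, "w") as f:
        f.write(f"Model: {results['model_name']}\n")
        f.write(f"Overall accuracy: {results['overall_accuracy']:.4f}\n")
        f.write(f"Valid-only accuracy: {results['valid_accuracy']:.4f}\n")
        f.write(f"Total questions: {results['total_questions']}\n")
        f.write(f"Valid / invalid predictions: "
                f"{results['valid_predictions']} / {results['invalid_predictions']}\n\n")
        f.write("Per-field performance:\n")
        for field, m in results['field_performance'].items():
            f.write(f"  {field}: accuracy={m['accuracy']:.4f}, "
                    f"valid_accuracy={m['valid_accuracy']:.4f}, "
                    f"correct={m['correct']}, total={m['total']}, "
                    f"invalid={m['invalid']}\n")
```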