SondosMB committed
Commit 7fcd557 · verified · 1 Parent(s): b79739e

Update app.py

Files changed (1)
  1. app.py +223 -122
app.py CHANGED
@@ -1,3 +1,181 @@
+# import gradio as gr
+# import pandas as pd
+# import os
+# import re
+# from datetime import datetime
+
+# LEADERBOARD_FILE = "leaderboard.csv" # File to store leaderboard data
+
+# def clean_answer(answer):
+# if pd.isna(answer):
+# return None
+# answer = str(answer)
+# clean = re.sub(r'[^A-Da-d]', '', answer)
+# if clean:
+# first_letter = clean[0].upper()
+# if first_letter in ['A', 'B', 'C', 'D']:
+# return first_letter
+# return None
+
+# def write_evaluation_results(results, output_file):
+# os.makedirs(os.path.dirname(output_file) if os.path.dirname(output_file) else '.', exist_ok=True)
+# timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+
+# output_text = [
+# f"Evaluation Results for Model: {results['model_name']}",
+# f"Timestamp: {timestamp}",
+# "-" * 50,
+# f"Overall Accuracy (including invalid): {results['overall_accuracy']:.2%}",
+# f"Accuracy (valid predictions only): {results['valid_accuracy']:.2%}",
+# f"Total Questions: {results['total_questions']}",
+# f"Valid Predictions: {results['valid_predictions']}",
+# f"Invalid/Malformed Predictions: {results['invalid_predictions']}",
+# f"Correct Predictions: {results['correct_predictions']}",
+# "\nPerformance by Field:",
+# "-" * 50
+# ]
+
+# for field, metrics in results['field_performance'].items():
+# field_results = [
+# f"\nField: {field}",
+# f"Accuracy (including invalid): {metrics['accuracy']:.2%}",
+# f"Accuracy (valid only): {metrics['valid_accuracy']:.2%}",
+# f"Correct: {metrics['correct']}/{metrics['total']}",
+# f"Invalid predictions: {metrics['invalid']}"
+# ]
+# output_text.extend(field_results)
+
+# with open(output_file, 'w') as f:
+# f.write('\n'.join(output_text))
+# print('\n'.join(output_text))
+# print(f"\nResults have been saved to: {output_file}")
+
+# def update_leaderboard(results):
+# # Add results to the leaderboard file
+# new_entry = {
+# "Model Name": results['model_name'],
+# "Overall Accuracy": f"{results['overall_accuracy']:.2%}",
+# "Valid Accuracy": f"{results['valid_accuracy']:.2%}",
+# "Correct Predictions": results['correct_predictions'],
+# "Total Questions": results['total_questions'],
+# "Timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+# }
+# leaderboard_df = pd.DataFrame([new_entry])
+# if os.path.exists(LEADERBOARD_FILE):
+# existing_df = pd.read_csv(LEADERBOARD_FILE)
+# leaderboard_df = pd.concat([existing_df, leaderboard_df], ignore_index=True)
+# leaderboard_df.to_csv(LEADERBOARD_FILE, index=False)
+
+# def display_leaderboard():
+# if not os.path.exists(LEADERBOARD_FILE):
+# return "Leaderboard is empty."
+# leaderboard_df = pd.read_csv(LEADERBOARD_FILE)
+# return leaderboard_df.to_markdown(index=False)
+
+# def evaluate_predictions(prediction_file):
+# ground_truth_file = "ground_truth.csv" # Specify the path to the ground truth file
+# if not prediction_file:
+# return "Prediction file not uploaded", None
+
+# if not os.path.exists(ground_truth_file):
+# return "Ground truth file not found", None
+
+# try:
+# predictions_df = pd.read_csv(prediction_file.name)
+# ground_truth_df = pd.read_csv(ground_truth_file)
+
+# # Extract model name
+# try:
+# filename = os.path.basename(prediction_file.name)
+# if "_" in filename and "." in filename:
+# model_name = filename.split('_')[1].split('.')[0]
+# else:
+# model_name = "unknown_model"
+# except IndexError:
+# model_name = "unknown_model"
+
+# # Merge dataframes
+# merged_df = pd.merge(
+# predictions_df,
+# ground_truth_df,
+# on='question_id',
+# how='inner'
+# )
+# merged_df['pred_answer'] = merged_df['predicted_answer'].apply(clean_answer)
+# invalid_predictions = merged_df['pred_answer'].isna().sum()
+# valid_predictions = merged_df.dropna(subset=['pred_answer'])
+# correct_predictions = (valid_predictions['pred_answer'] == valid_predictions['Answer']).sum()
+# total_predictions = len(merged_df)
+# total_valid_predictions = len(valid_predictions)
+
+# overall_accuracy = correct_predictions / total_predictions if total_predictions > 0 else 0
+# valid_accuracy = (
+# correct_predictions / total_valid_predictions
+# if total_valid_predictions > 0
+# else 0
+# )
+
+# field_metrics = {}
+# for field in merged_df['Field'].unique():
+# field_data = merged_df[merged_df['Field'] == field]
+# field_valid_data = field_data.dropna(subset=['pred_answer'])
+
+# field_correct = (field_valid_data['pred_answer'] == field_valid_data['Answer']).sum()
+# field_total = len(field_data)
+# field_valid_total = len(field_valid_data)
+# field_invalid = field_total - field_valid_total
+
+# field_metrics[field] = {
+# 'accuracy': field_correct / field_total if field_total > 0 else 0,
+# 'valid_accuracy': field_correct / field_valid_total if field_valid_total > 0 else 0,
+# 'correct': field_correct,
+# 'total': field_total,
+# 'invalid': field_invalid
+# }
+
+# results = {
+# 'model_name': model_name,
+# 'overall_accuracy': overall_accuracy,
+# 'valid_accuracy': valid_accuracy,
+# 'total_questions': total_predictions,
+# 'valid_predictions': total_valid_predictions,
+# 'invalid_predictions': invalid_predictions,
+# 'correct_predictions': correct_predictions,
+# 'field_performance': field_metrics
+# }
+
+# update_leaderboard(results)
+# output_file = "evaluation_results.txt"
+# write_evaluation_results(results, output_file)
+# return "Evaluation completed successfully! Leaderboard updated.", output_file
+
+# except Exception as e:
+# return f"Error during evaluation: {str(e)}", None
+
+# # Gradio Interface
+# description = "Upload a prediction CSV file to evaluate predictions against the ground truth and update the leaderboard."
+
+# demo = gr.Blocks()
+
+# with demo:
+# gr.Markdown("# Prediction Evaluation Tool with Leaderboard")
+# with gr.Tab("Evaluate"):
+# file_input = gr.File(label="Upload Prediction CSV")
+# eval_status = gr.Textbox(label="Evaluation Status")
+# eval_results_file = gr.File(label="Download Evaluation Results")
+# eval_button = gr.Button("Evaluate")
+# eval_button.click(
+# evaluate_predictions, inputs=file_input, outputs=[eval_status, eval_results_file]
+# )
+# with gr.Tab("Leaderboard"):
+# leaderboard_text = gr.Textbox(label="Leaderboard", interactive=False)
+# refresh_button = gr.Button("Refresh Leaderboard")
+# refresh_button.click(display_leaderboard, outputs=leaderboard_text)
+
+# if __name__ == "__main__":
+# demo.launch()
+
+
 import gradio as gr
 import pandas as pd
 import os
@@ -5,6 +183,7 @@ import re
 from datetime import datetime
 
 LEADERBOARD_FILE = "leaderboard.csv" # File to store leaderboard data
+LAST_UPDATED = datetime.now().strftime("%B %d, %Y")
 
 def clean_answer(answer):
     if pd.isna(answer):
@@ -12,53 +191,17 @@ def clean_answer(answer):
     answer = str(answer)
     clean = re.sub(r'[^A-Da-d]', '', answer)
     if clean:
-        first_letter = clean[0].upper()
-        if first_letter in ['A', 'B', 'C', 'D']:
-            return first_letter
+        return clean[0].upper()
     return None
 
-def write_evaluation_results(results, output_file):
-    os.makedirs(os.path.dirname(output_file) if os.path.dirname(output_file) else '.', exist_ok=True)
-    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
-
-    output_text = [
-        f"Evaluation Results for Model: {results['model_name']}",
-        f"Timestamp: {timestamp}",
-        "-" * 50,
-        f"Overall Accuracy (including invalid): {results['overall_accuracy']:.2%}",
-        f"Accuracy (valid predictions only): {results['valid_accuracy']:.2%}",
-        f"Total Questions: {results['total_questions']}",
-        f"Valid Predictions: {results['valid_predictions']}",
-        f"Invalid/Malformed Predictions: {results['invalid_predictions']}",
-        f"Correct Predictions: {results['correct_predictions']}",
-        "\nPerformance by Field:",
-        "-" * 50
-    ]
-
-    for field, metrics in results['field_performance'].items():
-        field_results = [
-            f"\nField: {field}",
-            f"Accuracy (including invalid): {metrics['accuracy']:.2%}",
-            f"Accuracy (valid only): {metrics['valid_accuracy']:.2%}",
-            f"Correct: {metrics['correct']}/{metrics['total']}",
-            f"Invalid predictions: {metrics['invalid']}"
-        ]
-        output_text.extend(field_results)
-
-    with open(output_file, 'w') as f:
-        f.write('\n'.join(output_text))
-    print('\n'.join(output_text))
-    print(f"\nResults have been saved to: {output_file}")
-
 def update_leaderboard(results):
-    # Add results to the leaderboard file
     new_entry = {
         "Model Name": results['model_name'],
-        "Overall Accuracy": f"{results['overall_accuracy']:.2%}",
-        "Valid Accuracy": f"{results['valid_accuracy']:.2%}",
+        "Overall Accuracy": round(results['overall_accuracy'] * 100, 2),
+        "Valid Accuracy": round(results['valid_accuracy'] * 100, 2),
        "Correct Predictions": results['correct_predictions'],
         "Total Questions": results['total_questions'],
-        "Timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+        "Timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
     }
     leaderboard_df = pd.DataFrame([new_entry])
     if os.path.exists(LEADERBOARD_FILE):
@@ -66,111 +209,69 @@ def update_leaderboard(results):
         leaderboard_df = pd.concat([existing_df, leaderboard_df], ignore_index=True)
     leaderboard_df.to_csv(LEADERBOARD_FILE, index=False)
 
-def display_leaderboard():
-    if not os.path.exists(LEADERBOARD_FILE):
-        return "Leaderboard is empty."
-    leaderboard_df = pd.read_csv(LEADERBOARD_FILE)
-    return leaderboard_df.to_markdown(index=False)
-
 def evaluate_predictions(prediction_file):
-    ground_truth_file = "ground_truth.csv" # Specify the path to the ground truth file
-    if not prediction_file:
-        return "Prediction file not uploaded", None
-
+    ground_truth_file = "ground_truth.csv"
     if not os.path.exists(ground_truth_file):
-        return "Ground truth file not found", None
+        return "Ground truth file not found."
+    if not prediction_file:
+        return "Prediction file not uploaded."
 
     try:
         predictions_df = pd.read_csv(prediction_file.name)
         ground_truth_df = pd.read_csv(ground_truth_file)
-
-        # Extract model name
-        try:
-            filename = os.path.basename(prediction_file.name)
-            if "_" in filename and "." in filename:
-                model_name = filename.split('_')[1].split('.')[0]
-            else:
-                model_name = "unknown_model"
-        except IndexError:
-            model_name = "unknown_model"
-
-        # Merge dataframes
-        merged_df = pd.merge(
-            predictions_df,
-            ground_truth_df,
-            on='question_id',
-            how='inner'
-        )
+        model_name = os.path.basename(prediction_file.name).split('_')[1].split('.')[0]
+
+        merged_df = pd.merge(predictions_df, ground_truth_df, on='question_id', how='inner')
         merged_df['pred_answer'] = merged_df['predicted_answer'].apply(clean_answer)
-        invalid_predictions = merged_df['pred_answer'].isna().sum()
+
         valid_predictions = merged_df.dropna(subset=['pred_answer'])
         correct_predictions = (valid_predictions['pred_answer'] == valid_predictions['Answer']).sum()
         total_predictions = len(merged_df)
         total_valid_predictions = len(valid_predictions)
 
         overall_accuracy = correct_predictions / total_predictions if total_predictions > 0 else 0
-        valid_accuracy = (
-            correct_predictions / total_valid_predictions
-            if total_valid_predictions > 0
-            else 0
-        )
-
-        field_metrics = {}
-        for field in merged_df['Field'].unique():
-            field_data = merged_df[merged_df['Field'] == field]
-            field_valid_data = field_data.dropna(subset=['pred_answer'])
-
-            field_correct = (field_valid_data['pred_answer'] == field_valid_data['Answer']).sum()
-            field_total = len(field_data)
-            field_valid_total = len(field_valid_data)
-            field_invalid = field_total - field_valid_total
-
-            field_metrics[field] = {
-                'accuracy': field_correct / field_total if field_total > 0 else 0,
-                'valid_accuracy': field_correct / field_valid_total if field_valid_total > 0 else 0,
-                'correct': field_correct,
-                'total': field_total,
-                'invalid': field_invalid
-            }
+        valid_accuracy = correct_predictions / total_valid_predictions if total_valid_predictions > 0 else 0
 
         results = {
             'model_name': model_name,
             'overall_accuracy': overall_accuracy,
             'valid_accuracy': valid_accuracy,
-            'total_questions': total_predictions,
-            'valid_predictions': total_valid_predictions,
-            'invalid_predictions': invalid_predictions,
             'correct_predictions': correct_predictions,
-            'field_performance': field_metrics
+            'total_questions': total_predictions,
        }
 
         update_leaderboard(results)
-        output_file = "evaluation_results.txt"
-        write_evaluation_results(results, output_file)
-        return "Evaluation completed successfully! Leaderboard updated.", output_file
-
+        return "Evaluation completed successfully! Leaderboard updated."
     except Exception as e:
-        return f"Error during evaluation: {str(e)}", None
+        return f"Error during evaluation: {str(e)}"
 
-# Gradio Interface
-description = "Upload a prediction CSV file to evaluate predictions against the ground truth and update the leaderboard."
-
-demo = gr.Blocks()
+def load_leaderboard():
+    if not os.path.exists(LEADERBOARD_FILE):
+        return pd.DataFrame({"Message": ["Leaderboard is empty."]})
+    return pd.read_csv(LEADERBOARD_FILE)
 
-with demo:
+# Build Gradio App
+with gr.Blocks() as demo:
     gr.Markdown("# Prediction Evaluation Tool with Leaderboard")
-    with gr.Tab("Evaluate"):
-        file_input = gr.File(label="Upload Prediction CSV")
-        eval_status = gr.Textbox(label="Evaluation Status")
-        eval_results_file = gr.File(label="Download Evaluation Results")
-        eval_button = gr.Button("Evaluate")
-        eval_button.click(
-            evaluate_predictions, inputs=file_input, outputs=[eval_status, eval_results_file]
-        )
-    with gr.Tab("Leaderboard"):
-        leaderboard_text = gr.Textbox(label="Leaderboard", interactive=False)
-        refresh_button = gr.Button("Refresh Leaderboard")
-        refresh_button.click(display_leaderboard, outputs=leaderboard_text)
-
-if __name__ == "__main__":
-    demo.launch()
+    with gr.Tabs():
+        with gr.TabItem("🏅 Submission"):
+            file_input = gr.File(label="Upload Prediction CSV")
+            eval_status = gr.Textbox(label="Evaluation Status", interactive=False)
+            eval_button = gr.Button("Evaluate and Update Leaderboard")
+            eval_button.click(
+                evaluate_predictions,
+                inputs=[file_input],
+                outputs=[eval_status],
+            )
+        with gr.TabItem("🏅 Leaderboard"):
+            leaderboard_table = gr.Dataframe(
+                value=load_leaderboard(),
+                label="Leaderboard",
+                interactive=False,
+                wrap=True,
+            )
+
+    gr.Markdown(f"Last updated on **{LAST_UPDATED}**")
+
+demo.launch()
+
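
The sketch below is an illustrative reference for preparing a submission against the updated app.py; it is not part of the commit. It assumes a ground_truth.csv with question_id and Answer columns and a prediction CSV with question_id and predicted_answer columns, and it mirrors the scoring path in evaluate_predictions (clean_answer is copied from the committed file). The submission filename is expected to contain an underscore, since the model name is taken from the segment after the first underscore (for example, predictions_mymodel.csv is recorded as mymodel); the toy rows and file names here are assumptions.

# Illustrative only -- toy data; mirrors the committed scoring logic.
import re
import pandas as pd

def clean_answer(answer):
    # Copied from the committed app.py: keep only A-D characters, return the first, uppercased.
    if pd.isna(answer):
        return None
    answer = str(answer)
    clean = re.sub(r'[^A-Da-d]', '', answer)
    if clean:
        return clean[0].upper()
    return None

# Ground-truth and prediction schemas used by the merge in evaluate_predictions.
ground_truth = pd.DataFrame({"question_id": [1, 2, 3], "Answer": ["A", "B", "C"]})
predictions = pd.DataFrame({"question_id": [1, 2, 3], "predicted_answer": ["A", "b)", "C."]})

merged = predictions.merge(ground_truth, on="question_id", how="inner")
merged["pred_answer"] = merged["predicted_answer"].apply(clean_answer)
valid = merged.dropna(subset=["pred_answer"])
correct = (valid["pred_answer"] == valid["Answer"]).sum()
overall_accuracy = correct / len(merged) if len(merged) else 0
valid_accuracy = correct / len(valid) if len(valid) else 0
print(f"Overall: {overall_accuracy:.2%}, valid-only: {valid_accuracy:.2%}")  # 100.00% for this toy data

Because clean_answer keeps only the first A-D character it finds, free-text answers such as "b)" or "C." are normalized to a single letter before comparison, and any prediction without an A-D character counts as invalid.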