rohansampath committed on
Commit
431920c
·
verified ·
1 Parent(s): f70cbf3

Update dataset_previews.py

Browse files
Files changed (1) hide show
  1. dataset_previews.py +9 -21
dataset_previews.py CHANGED
@@ -17,33 +17,21 @@ def calculate_dataset_statistics():
17
  # Load MMLU-Pro data using the function from mmlu_pro_eval_adapted
18
  test_df, val_df = load_mmlu_pro()
19
 
 
20
  test_df = test_df.sort_values(['category', 'question_id'])
21
-
22
- all_subjects = sorted(test_df['category'].unique())
23
 
24
- # Calculate total questions and questions per subject
25
  total_questions = len(test_df)
26
- subject_counts = {}
27
-
28
- # Count options per question
29
- options_counts = []
30
-
31
- for subject in all_subjects:
32
- print("Subject", subject)
33
- test_samples = test_df[test_df['category'] == subject]
34
- num_questions = len(test_samples)
35
- subject_counts[subject] = num_questions
36
- print("First sample", test_samples.head(1), "\t Num Questions:", num_questions)
37
-
38
- # Count options for each question
39
- for sample in test_samples:
40
- print ("SAMPLE", sample)
41
- options_counts.append(len(sample['options']))
42
 
 
 
 
 
 
43
  max_options = max(options_counts)
44
  avg_options = sum(options_counts) / len(options_counts)
45
-
46
- # Count questions with each number of options
47
  options_distribution = collections.Counter(options_counts)
48
 
49
  return {
 
17
  # Load MMLU-Pro data using the function from mmlu_pro_eval_adapted
18
  test_df, val_df = load_mmlu_pro()
19
 
20
+ # Ensure consistent ordering
21
  test_df = test_df.sort_values(['category', 'question_id'])
 
 
22
 
23
+ # Calculate total questions
24
  total_questions = len(test_df)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
25
 
26
+ # Calculate subject-wise question counts efficiently
27
+ subject_counts = test_df['category'].value_counts().to_dict()
28
+
29
+ # Count options per question efficiently using `.apply()`
30
+ options_counts = test_df['options'].apply(len).tolist()
31
  max_options = max(options_counts)
32
  avg_options = sum(options_counts) / len(options_counts)
33
+
34
+ # Count frequency of each option count
35
  options_distribution = collections.Counter(options_counts)
36
 
37
  return {