Spaces:
Sleeping
Sleeping
Update dataset_previews.py
Browse files
- dataset_previews.py +9 -21
dataset_previews.py
CHANGED
@@ -17,33 +17,21 @@ def calculate_dataset_statistics():
|
|
17 |
# Load MMLU-Pro data using the function from mmlu_pro_eval_adapted
|
18 |
test_df, val_df = load_mmlu_pro()
|
19 |
|
|
|
20 |
test_df = test_df.sort_values(['category', 'question_id'])
|
21 |
-
|
22 |
-
all_subjects = sorted(test_df['category'].unique())
|
23 |
|
24 |
-
# Calculate total questions
|
25 |
total_questions = len(test_df)
|
26 |
-
subject_counts = {}
|
27 |
-
|
28 |
-
# Count options per question
|
29 |
-
options_counts = []
|
30 |
-
|
31 |
-
for subject in all_subjects:
|
32 |
-
print("Subject", subject)
|
33 |
-
test_samples = test_df[test_df['category'] == subject]
|
34 |
-
num_questions = len(test_samples)
|
35 |
-
subject_counts[subject] = num_questions
|
36 |
-
print("First sample", test_samples.head(1), "\t Num Questions:", num_questions)
|
37 |
-
|
38 |
-
# Count options for each question
|
39 |
-
for sample in test_samples:
|
40 |
-
print ("SAMPLE", sample)
|
41 |
-
options_counts.append(len(sample['options']))
|
42 |
|
|
|
|
|
|
|
|
|
|
|
43 |
max_options = max(options_counts)
|
44 |
avg_options = sum(options_counts) / len(options_counts)
|
45 |
-
|
46 |
-
# Count
|
47 |
options_distribution = collections.Counter(options_counts)
|
48 |
|
49 |
return {
|
|
|
17 |
# Load MMLU-Pro data using the function from mmlu_pro_eval_adapted
|
18 |
test_df, val_df = load_mmlu_pro()
|
19 |
|
20 |
+
# Ensure consistent ordering
|
21 |
test_df = test_df.sort_values(['category', 'question_id'])
|
|
|
|
|
22 |
|
23 |
+
# Calculate total questions
|
24 |
total_questions = len(test_df)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
25 |
|
26 |
+
# Calculate subject-wise question counts efficiently
|
27 |
+
subject_counts = test_df['category'].value_counts().to_dict()
|
28 |
+
|
29 |
+
# Count options per question efficiently using `.apply()`
|
30 |
+
options_counts = test_df['options'].apply(len).tolist()
|
31 |
max_options = max(options_counts)
|
32 |
avg_options = sum(options_counts) / len(options_counts)
|
33 |
+
|
34 |
+
# Count frequency of each option count
|
35 |
options_distribution = collections.Counter(options_counts)
|
36 |
|
37 |
return {
|