rohansampath committed on
Commit cb27368 · verified · 1 Parent(s): 58140dc

Update dataset_previews.py

Files changed (1)
  1. dataset_previews.py +138 -70
dataset_previews.py CHANGED
@@ -1,26 +1,62 @@
  import os
  import json
  import pandas as pd
- from typing import Dict, Any

- def calculate_evaluation_time(num_questions: int, input_tokens: int, generated_tokens_per_question: int = 300):
-     """Calculate approximate evaluation time based on token counts and throughput assumptions"""
-     # Constants
-     PROMPT_THROUGHPUT = 5000 # tokens per second
-     GENERATION_THROUGHPUT = 500 # tokens per second
-     OVERHEAD_MINUTES = 2 # Fixed overhead for model loading, etc.
-
-     # Calculate total generated tokens
-     total_generated_tokens = num_questions * generated_tokens_per_question
-
-     # Calculate time components (in seconds)
-     prompt_time = input_tokens / PROMPT_THROUGHPUT
-     generation_time = total_generated_tokens / GENERATION_THROUGHPUT
-
-     # Total time in minutes
-     total_time_minutes = (prompt_time + generation_time) / 60 + OVERHEAD_MINUTES
-
-     return total_time_minutes

  def mmlupro_dataset_preview() -> Dict[str, Any]:
      """
@@ -30,6 +66,7 @@ def mmlupro_dataset_preview() -> Dict[str, Any]:
          Dict[str, Any]: Dictionary containing dataset information
      """
      preview_file = "/data/mmlu_pro_dataset_preview_table.json"
      # Check if preview file exists
      if os.path.exists(preview_file):
          try:
@@ -42,51 +79,71 @@ def mmlupro_dataset_preview() -> Dict[str, Any]:
              # If file exists but can't be read, regenerate it

      # Generate preview data if file doesn't exist or couldn't be read
-     num_questions = 12032
-     input_tokens = 12642105
-     generated_tokens_per_question = 300
-
-     # Calculate evaluation time
-     eval_time_minutes = calculate_evaluation_time(
-         num_questions,
-         input_tokens,
-         generated_tokens_per_question
-     )
-
-     # Create preview data
-     preview_data = {
-         "dataset_name": "MMLU-Pro",
-         "evaluation_type": "Multiple Choice",
-         "description": "MMLU-Pro is a refined version of the MMLU dataset, which has been a standard for multiple-choice knowledge assessment. Recent research identified issues with the original MMLU, such as noisy data (some unanswerable questions) and decreasing difficulty due to advances in model capabilities and increased data contamination. MMLU-Pro addresses these issues by presenting models with 10 choices instead of 4, requiring reasoning on more questions, and undergoing expert review to reduce noise. As a result, MMLU-Pro is of higher quality and currently more challenging than the original. A higher score is a better score.",
-         "links": {
-             "huggingface": "https://huggingface.co/datasets/TIGER-Lab/MMLU-Pro",
-             "github": "https://github.com/TIGER-AI-Lab/MMLU-Pro",
-             "paper": "https://arxiv.org/abs/2406.01574"
-         },
-         "organization": "Questions are organized into 14 subjects. Each subject has 5 validation questions (for a total of 70). The 5 validation questions serve as 5-shot prompts for each evaluation question.",
-         "num_questions": num_questions,
-         "input_tokens": input_tokens,
-         "evaluation_time": {
-             "generated_tokens_per_question": generated_tokens_per_question,
-             "total_generated_tokens": num_questions * generated_tokens_per_question,
-             "prompt_throughput": 5000,
-             "generation_throughput": 500,
-             "total_time_minutes": round(eval_time_minutes, 2)
-         }
-     }
-
-     # Save preview data to file
      try:
-         with open(preview_file, 'w') as f:
-             json.dump(preview_data, f, indent=2)
      except Exception as e:
-         print(f"Error writing preview file: {e}")

      return preview_data

  def format_preview_for_display(preview_data: Dict[str, Any]) -> pd.DataFrame:
      """
-     Format the preview data into a DataFrame for display in Gradio

      Args:
          preview_data (Dict[str, Any]): Dataset preview information
@@ -94,20 +151,31 @@ def format_preview_for_display(preview_data: Dict[str, Any]) -> pd.DataFrame:
      Returns:
          pd.DataFrame: Formatted data for display
      """
-     # Create a table format with keys and values
      rows = [
-         {"Key": "Dataset Name", "Value": preview_data["dataset_name"]},
-         {"Key": "Evaluation Type", "Value": preview_data["evaluation_type"]},
-         {"Key": "Description", "Value": preview_data["description"]},
-         {"Key": "Links", "Value": (
-             f"Dataset: {preview_data['links']['huggingface']}\n"
-             f"GitHub: {preview_data['links']['github']}\n"
-             f"Paper: {preview_data['links']['paper']}"
-         )},
-         {"Key": "Organization", "Value": preview_data["organization"]},
-         {"Key": "Number of Questions", "Value": preview_data["num_questions"]},
-         {"Key": "Number of Input Tokens", "Value": preview_data["input_tokens"]},
-         {"Key": "Estimated Evaluation Time", "Value": f"{preview_data['evaluation_time']['total_time_minutes']} minutes (for 2 models on A100)"}
      ]

-     return pd.DataFrame(rows)
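As a quick sanity check of the estimator removed above (not part of the commit itself): feeding it the hardcoded workload that this commit also drops, 12,032 questions, 12,642,105 input tokens, and 300 generated tokens per question, gives an estimate of roughly 164 minutes. The sketch below just reproduces that arithmetic with the removed function's own constants.

# Reproducing the removed estimate
# (assumed constants from the old code: 5000 prompt tokens/s, 500 generation tokens/s, 2 min fixed overhead)
num_questions, input_tokens, gen_per_question = 12032, 12642105, 300
prompt_s = input_tokens / 5000                          # ~2528 s
generation_s = num_questions * gen_per_question / 500   # ~7219 s
print((prompt_s + generation_s) / 60 + 2)               # ~164.5 minutes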
 
 
 
 
 
 
 
 
 
dataset_previews.py (new version)
  import os
  import json
  import pandas as pd
+ import numpy as np
+ from typing import Dict, Any, List, Tuple
+ import collections
+ from mmlu_pro_eval_adapted import load_mmlu_pro_data, preprocess # Import preprocess also

+ def calculate_dataset_statistics():
+     """
+     Calculate detailed statistics about the MMLU-Pro dataset
+
+     Returns:
+         Dict: Dictionary containing dataset statistics
+     """
+     try:
+         # Load MMLU-Pro data using the function from mmlu_pro_eval_adapted
+         mmlu_data = load_mmlu_pro_data(num_subjects=-1, num_questions=-1)
+
+         # Calculate total questions and questions per subject
+         total_questions = 0
+         subject_counts = {}
+
+         # Count options per question
+         options_counts = []
+
+         for subject_name, subject_data in mmlu_data.items():
+             num_questions = len(subject_data["test_examples"])
+             subject_counts[subject_name] = num_questions
+             total_questions += num_questions
+
+             # Count options for each question
+             for test_example in subject_data["test_examples"]:
+                 options_counts.append(len(test_example["options"]))
+
+         max_options = max(options_counts)
+         avg_options = sum(options_counts) / len(options_counts)
+
+         # Count questions with each number of options
+         options_distribution = collections.Counter(options_counts)
+
+         return {
+             "total_questions": total_questions,
+             "subject_counts": subject_counts,
+             "max_options": max_options,
+             "avg_options": avg_options,
+             "options_distribution": options_distribution
+         }
+
+     except Exception as e:
+         print(f"Error calculating dataset statistics: {e}")
+         # Fallback values if calculation fails
+         return {
+             "total_questions": 12032,
+             "subject_counts": {"Total": 12032},
+             "max_options": 10,
+             "avg_options": 10.0,
+             "options_distribution": {10: 12032}
+         }

  def mmlupro_dataset_preview() -> Dict[str, Any]:
      """
 
          Dict[str, Any]: Dictionary containing dataset information
      """
      preview_file = "/data/mmlu_pro_dataset_preview_table.json"
+
      # Check if preview file exists
      if os.path.exists(preview_file):
          try:

              # If file exists but can't be read, regenerate it

      # Generate preview data if file doesn't exist or couldn't be read
      try:
+         # Calculate dataset statistics
+         stats = calculate_dataset_statistics()
+
+         # Format subject counts as a string, in descending order
+         sorted_subjects = sorted(stats["subject_counts"].items(), key=lambda x: x[1], reverse=True)
+         subject_counts_str = f"Total: {stats['total_questions']}\n"
+         for subject, count in sorted_subjects:
+             subject_counts_str += f"{subject}: {count}\n"
+         subject_counts_str = subject_counts_str.strip()
+
+         # Format options distribution as a string
+         options_dist_str = f"Maximum: {stats['max_options']}\nAverage: {stats['avg_options']:.2f}\n"
+         sorted_options = sorted(stats["options_distribution"].items(), key=lambda x: x[0], reverse=True)
+         for num_options, count in sorted_options:
+             options_dist_str += f"{num_options}-choices: {count}, "
+         options_dist_str = options_dist_str.rstrip(", ")
+
+         # Create preview data
+         preview_data = {
+             "dataset_name": "MMLU-Pro",
+             "evaluation_type": "Multiple Choice",
+             "description": "MMLU-Pro is a refined version of the MMLU dataset, which has been a standard for multiple-choice knowledge assessment. Recent research identified issues with the original MMLU, such as noisy data (some unanswerable questions) and decreasing difficulty due to advances in model capabilities and increased data contamination. MMLU-Pro addresses these issues by presenting models with 10 choices instead of 4, requiring reasoning on more questions, and undergoing expert review to reduce noise. As a result, MMLU-Pro is of higher quality and currently more challenging than the original. A higher score is a better score.",
+             "links": {
+                 "huggingface": "https://huggingface.co/datasets/TIGER-Lab/MMLU-Pro",
+                 "github": "https://github.com/TIGER-AI-Lab/MMLU-Pro",
+                 "paper": "https://arxiv.org/abs/2406.01574"
+             },
+             "organization": "Questions are organized into 14 subjects. Each subject has 5 validation questions (for a total of 70). The 5 validation questions serve as 5-shot prompts for each evaluation question.",
+             "num_questions": subject_counts_str,
+             "choices_per_question": options_dist_str
+         }
+
+         # Save preview data to file
+         try:
+             with open(preview_file, 'w') as f:
+                 json.dump(preview_data, f, indent=2)
+         except Exception as e:
+             print(f"Error writing preview file: {e}")
+
      except Exception as e:
+         # If calculation fails, fall back to hardcoded values
+         print(f"Error calculating dynamic values: {e}")
+         # Hardcoded fallback values
+         num_questions = 12032
+
+         preview_data = {
+             "dataset_name": "MMLU-Pro",
+             "evaluation_type": "Multiple Choice",
+             "description": "MMLU-Pro is a refined version of the MMLU dataset, which has been a standard for multiple-choice knowledge assessment. Recent research identified issues with the original MMLU, such as noisy data (some unanswerable questions) and decreasing difficulty due to advances in model capabilities and increased data contamination. MMLU-Pro addresses these issues by presenting models with 10 choices instead of 4, requiring reasoning on more questions, and undergoing expert review to reduce noise. As a result, MMLU-Pro is of higher quality and currently more challenging than the original. A higher score is a better score.",
+             "links": {
+                 "huggingface": "https://huggingface.co/datasets/TIGER-Lab/MMLU-Pro",
+                 "github": "https://github.com/TIGER-AI-Lab/MMLU-Pro",
+                 "paper": "https://arxiv.org/abs/2406.01574"
+             },
+             "organization": "Questions are organized into 14 subjects. Each subject has 5 validation questions (for a total of 70). The 5 validation questions serve as 5-shot prompts for each evaluation question.",
+             "num_questions": f"Total: {num_questions} (Note: Using fallback value)",
+             "choices_per_question": "Maximum: 10\nAverage: 10.0\n10-choices: 12032"
+         }

      return preview_data

  def format_preview_for_display(preview_data: Dict[str, Any]) -> pd.DataFrame:
      """
+     Format the preview data with improved readability for display in Gradio

      Args:
          preview_data (Dict[str, Any]): Dataset preview information

      Returns:
          pd.DataFrame: Formatted data for display
      """
+     # Create links with bullet points
+     links_value = (
+         f"Dataset: {preview_data['links']['huggingface']}\n"
+         f"GitHub: {preview_data['links']['github']}\n"
+         f"Paper: {preview_data['links']['paper']}"
+     )
+     links_formatted = "• " + "\n• ".join(links_value.split('\n'))
+
+     # Create a table format with better column names
      rows = [
+         {"Dataset Property": "Dataset Name", "Details": preview_data["dataset_name"]},
+         {"Dataset Property": "Evaluation Type", "Details": preview_data["evaluation_type"]},
+         {"Dataset Property": "Description", "Details": preview_data["description"]},
+         {"Dataset Property": "Links", "Details": links_formatted},
+         {"Dataset Property": "Organization", "Details": preview_data["organization"]},
+         {"Dataset Property": "Number of Questions", "Details": preview_data["num_questions"]},
+         {"Dataset Property": "Choices per Question", "Details": preview_data["choices_per_question"]}
      ]

+     return pd.DataFrame(rows)
+
+ # For standalone testing
+ if __name__ == "__main__":
+     preview_data = mmlupro_dataset_preview()
+     print("Preview data generated:")
+     for key, value in preview_data.items():
+         if key != "links":
+             print(f"\n{key}:\n{value}")