rohansampath committed on
Commit cb27368 · verified · 1 Parent(s): 58140dc

Update dataset_previews.py

Files changed (1)
  1. dataset_previews.py +138 -70
dataset_previews.py CHANGED
@@ -1,26 +1,62 @@
  import os
  import json
  import pandas as pd
- from typing import Dict, Any

- def calculate_evaluation_time(num_questions: int, input_tokens: int, generated_tokens_per_question: int = 300):
-     """Calculate approximate evaluation time based on token counts and throughput assumptions"""
-     # Constants
-     PROMPT_THROUGHPUT = 5000 # tokens per second
-     GENERATION_THROUGHPUT = 500 # tokens per second
-     OVERHEAD_MINUTES = 2 # Fixed overhead for model loading, etc.
-
-     # Calculate total generated tokens
-     total_generated_tokens = num_questions * generated_tokens_per_question
-
-     # Calculate time components (in seconds)
-     prompt_time = input_tokens / PROMPT_THROUGHPUT
-     generation_time = total_generated_tokens / GENERATION_THROUGHPUT
-
-     # Total time in minutes
-     total_time_minutes = (prompt_time + generation_time) / 60 + OVERHEAD_MINUTES
-
-     return total_time_minutes

  def mmlupro_dataset_preview() -> Dict[str, Any]:
      """
@@ -30,6 +66,7 @@ def mmlupro_dataset_preview() -> Dict[str, Any]:
          Dict[str, Any]: Dictionary containing dataset information
      """
      preview_file = "/data/mmlu_pro_dataset_preview_table.json"
      # Check if preview file exists
      if os.path.exists(preview_file):
          try:
@@ -42,51 +79,71 @@ def mmlupro_dataset_preview() -> Dict[str, Any]:
              # If file exists but can't be read, regenerate it

      # Generate preview data if file doesn't exist or couldn't be read
-     num_questions = 12032
-     input_tokens = 12642105
-     generated_tokens_per_question = 300
-
-     # Calculate evaluation time
-     eval_time_minutes = calculate_evaluation_time(
-         num_questions,
-         input_tokens,
-         generated_tokens_per_question
-     )
-
-     # Create preview data
-     preview_data = {
-         "dataset_name": "MMLU-Pro",
-         "evaluation_type": "Multiple Choice",
-         "description": "MMLU-Pro is a refined version of the MMLU dataset, which has been a standard for multiple-choice knowledge assessment. Recent research identified issues with the original MMLU, such as noisy data (some unanswerable questions) and decreasing difficulty due to advances in model capabilities and increased data contamination. MMLU-Pro addresses these issues by presenting models with 10 choices instead of 4, requiring reasoning on more questions, and undergoing expert review to reduce noise. As a result, MMLU-Pro is of higher quality and currently more challenging than the original. A higher score is a better score.",
-         "links": {
-             "huggingface": "https://huggingface.co/datasets/TIGER-Lab/MMLU-Pro",
-             "github": "https://github.com/TIGER-AI-Lab/MMLU-Pro",
-             "paper": "https://arxiv.org/abs/2406.01574"
-         },
-         "organization": "Questions are organized into 14 subjects. Each subject has 5 validation questions (for a total of 70). The 5 validation questions serve as 5-shot prompts for each evaluation question.",
-         "num_questions": num_questions,
-         "input_tokens": input_tokens,
-         "evaluation_time": {
-             "generated_tokens_per_question": generated_tokens_per_question,
-             "total_generated_tokens": num_questions * generated_tokens_per_question,
-             "prompt_throughput": 5000,
-             "generation_throughput": 500,
-             "total_time_minutes": round(eval_time_minutes, 2)
-         }
-     }
-
-     # Save preview data to file
      try:
-         with open(preview_file, 'w') as f:
-             json.dump(preview_data, f, indent=2)
      except Exception as e:
-         print(f"Error writing preview file: {e}")

      return preview_data

  def format_preview_for_display(preview_data: Dict[str, Any]) -> pd.DataFrame:
      """
-     Format the preview data into a DataFrame for display in Gradio

      Args:
          preview_data (Dict[str, Any]): Dataset preview information
@@ -94,20 +151,31 @@ def format_preview_for_display(preview_data: Dict[str, Any]) -> pd.DataFrame:
      Returns:
          pd.DataFrame: Formatted data for display
      """
-     # Create a table format with keys and values
      rows = [
-         {"Key": "Dataset Name", "Value": preview_data["dataset_name"]},
-         {"Key": "Evaluation Type", "Value": preview_data["evaluation_type"]},
-         {"Key": "Description", "Value": preview_data["description"]},
-         {"Key": "Links", "Value": (
-             f"Dataset: {preview_data['links']['huggingface']}\n"
-             f"GitHub: {preview_data['links']['github']}\n"
-             f"Paper: {preview_data['links']['paper']}"
-         )},
-         {"Key": "Organization", "Value": preview_data["organization"]},
-         {"Key": "Number of Questions", "Value": preview_data["num_questions"]},
-         {"Key": "Number of Input Tokens", "Value": preview_data["input_tokens"]},
-         {"Key": "Estimated Evaluation Time", "Value": f"{preview_data['evaluation_time']['total_time_minutes']} minutes (for 2 models on A100)"}
      ]

-     return pd.DataFrame(rows)
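As a quick sanity check of the estimator removed above (not part of the commit itself): feeding it the hardcoded workload that this commit also drops, 12,032 questions, 12,642,105 input tokens, and 300 generated tokens per question, gives an estimate of roughly 164 minutes. The sketch below just reproduces that arithmetic with the removed function's own constants.

# Reproducing the removed estimate
# (assumed constants from the old code: 5000 prompt tokens/s, 500 generation tokens/s, 2 min fixed overhead)
num_questions, input_tokens, gen_per_question = 12032, 12642105, 300
prompt_s = input_tokens / 5000                          # ~2528 s
generation_s = num_questions * gen_per_question / 500   # ~7219 s
print((prompt_s + generation_s) / 60 + 2)               # ~164.5 minutes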
 
 
 
 
 
 
 
 
 
dataset_previews.py (new version)
  import os
  import json
  import pandas as pd
+ import numpy as np
+ from typing import Dict, Any, List, Tuple
+ import collections
+ from mmlu_pro_eval_adapted import load_mmlu_pro_data, preprocess # Import preprocess also

+ def calculate_dataset_statistics():
+     """
+     Calculate detailed statistics about the MMLU-Pro dataset
+
+     Returns:
+         Dict: Dictionary containing dataset statistics
+     """
+     try:
+         # Load MMLU-Pro data using the function from mmlu_pro_eval_adapted
+         mmlu_data = load_mmlu_pro_data(num_subjects=-1, num_questions=-1)
+
+         # Calculate total questions and questions per subject
+         total_questions = 0
+         subject_counts = {}
+
+         # Count options per question
+         options_counts = []
+
+         for subject_name, subject_data in mmlu_data.items():
+             num_questions = len(subject_data["test_examples"])
+             subject_counts[subject_name] = num_questions
+             total_questions += num_questions
+
+             # Count options for each question
+             for test_example in subject_data["test_examples"]:
+                 options_counts.append(len(test_example["options"]))
+
+         max_options = max(options_counts)
+         avg_options = sum(options_counts) / len(options_counts)
+
+         # Count questions with each number of options
+         options_distribution = collections.Counter(options_counts)
+
+         return {
+             "total_questions": total_questions,
+             "subject_counts": subject_counts,
+             "max_options": max_options,
+             "avg_options": avg_options,
+             "options_distribution": options_distribution
+         }
+
+     except Exception as e:
+         print(f"Error calculating dataset statistics: {e}")
+         # Fallback values if calculation fails
+         return {
+             "total_questions": 12032,
+             "subject_counts": {"Total": 12032},
+             "max_options": 10,
+             "avg_options": 10.0,
+             "options_distribution": {10: 12032}
+         }

  def mmlupro_dataset_preview() -> Dict[str, Any]:
      """
 
          Dict[str, Any]: Dictionary containing dataset information
      """
      preview_file = "/data/mmlu_pro_dataset_preview_table.json"
+
      # Check if preview file exists
      if os.path.exists(preview_file):
          try:

              # If file exists but can't be read, regenerate it

      # Generate preview data if file doesn't exist or couldn't be read
      try:
+         # Calculate dataset statistics
+         stats = calculate_dataset_statistics()
+
+         # Format subject counts as a string, in descending order
+         sorted_subjects = sorted(stats["subject_counts"].items(), key=lambda x: x[1], reverse=True)
+         subject_counts_str = f"Total: {stats['total_questions']}\n"
+         for subject, count in sorted_subjects:
+             subject_counts_str += f"{subject}: {count}\n"
+         subject_counts_str = subject_counts_str.strip()
+
+         # Format options distribution as a string
+         options_dist_str = f"Maximum: {stats['max_options']}\nAverage: {stats['avg_options']:.2f}\n"
+         sorted_options = sorted(stats["options_distribution"].items(), key=lambda x: x[0], reverse=True)
+         for num_options, count in sorted_options:
+             options_dist_str += f"{num_options}-choices: {count}, "
+         options_dist_str = options_dist_str.rstrip(", ")
+
+         # Create preview data
+         preview_data = {
+             "dataset_name": "MMLU-Pro",
+             "evaluation_type": "Multiple Choice",
+             "description": "MMLU-Pro is a refined version of the MMLU dataset, which has been a standard for multiple-choice knowledge assessment. Recent research identified issues with the original MMLU, such as noisy data (some unanswerable questions) and decreasing difficulty due to advances in model capabilities and increased data contamination. MMLU-Pro addresses these issues by presenting models with 10 choices instead of 4, requiring reasoning on more questions, and undergoing expert review to reduce noise. As a result, MMLU-Pro is of higher quality and currently more challenging than the original. A higher score is a better score.",
+             "links": {
+                 "huggingface": "https://huggingface.co/datasets/TIGER-Lab/MMLU-Pro",
+                 "github": "https://github.com/TIGER-AI-Lab/MMLU-Pro",
+                 "paper": "https://arxiv.org/abs/2406.01574"
+             },
+             "organization": "Questions are organized into 14 subjects. Each subject has 5 validation questions (for a total of 70). The 5 validation questions serve as 5-shot prompts for each evaluation question.",
+             "num_questions": subject_counts_str,
+             "choices_per_question": options_dist_str
+         }
+
+         # Save preview data to file
+         try:
+             with open(preview_file, 'w') as f:
+                 json.dump(preview_data, f, indent=2)
+         except Exception as e:
+             print(f"Error writing preview file: {e}")
+
      except Exception as e:
+         # If calculation fails, fall back to hardcoded values
+         print(f"Error calculating dynamic values: {e}")
+         # Hardcoded fallback values
+         num_questions = 12032
+
+         preview_data = {
+             "dataset_name": "MMLU-Pro",
+             "evaluation_type": "Multiple Choice",
+             "description": "MMLU-Pro is a refined version of the MMLU dataset, which has been a standard for multiple-choice knowledge assessment. Recent research identified issues with the original MMLU, such as noisy data (some unanswerable questions) and decreasing difficulty due to advances in model capabilities and increased data contamination. MMLU-Pro addresses these issues by presenting models with 10 choices instead of 4, requiring reasoning on more questions, and undergoing expert review to reduce noise. As a result, MMLU-Pro is of higher quality and currently more challenging than the original. A higher score is a better score.",
+             "links": {
+                 "huggingface": "https://huggingface.co/datasets/TIGER-Lab/MMLU-Pro",
+                 "github": "https://github.com/TIGER-AI-Lab/MMLU-Pro",
+                 "paper": "https://arxiv.org/abs/2406.01574"
+             },
+             "organization": "Questions are organized into 14 subjects. Each subject has 5 validation questions (for a total of 70). The 5 validation questions serve as 5-shot prompts for each evaluation question.",
+             "num_questions": f"Total: {num_questions} (Note: Using fallback value)",
+             "choices_per_question": "Maximum: 10\nAverage: 10.0\n10-choices: 12032"
+         }

      return preview_data

  def format_preview_for_display(preview_data: Dict[str, Any]) -> pd.DataFrame:
      """
+     Format the preview data with improved readability for display in Gradio

      Args:
          preview_data (Dict[str, Any]): Dataset preview information

      Returns:
          pd.DataFrame: Formatted data for display
      """
+     # Create links with bullet points
+     links_value = (
+         f"Dataset: {preview_data['links']['huggingface']}\n"
+         f"GitHub: {preview_data['links']['github']}\n"
+         f"Paper: {preview_data['links']['paper']}"
+     )
+     links_formatted = "• " + "\n• ".join(links_value.split('\n'))
+
+     # Create a table format with better column names
      rows = [
+         {"Dataset Property": "Dataset Name", "Details": preview_data["dataset_name"]},
+         {"Dataset Property": "Evaluation Type", "Details": preview_data["evaluation_type"]},
+         {"Dataset Property": "Description", "Details": preview_data["description"]},
+         {"Dataset Property": "Links", "Details": links_formatted},
+         {"Dataset Property": "Organization", "Details": preview_data["organization"]},
+         {"Dataset Property": "Number of Questions", "Details": preview_data["num_questions"]},
+         {"Dataset Property": "Choices per Question", "Details": preview_data["choices_per_question"]}
      ]

+     return pd.DataFrame(rows)
+
+ # For standalone testing
+ if __name__ == "__main__":
+     preview_data = mmlupro_dataset_preview()
+     print("Preview data generated:")
+     for key, value in preview_data.items():
+         if key != "links":
+             print(f"\n{key}:\n{value}")