Spaces:

taesiri
/

BugsBunny-EvalBuilder

Build error

App Files Files Community

taesiri committed on Feb 4

Commit

02d2765

1 Parent(s): c2606ea

backup

Browse files

Files changed (1) hide show

utils.py +140 -0

utils.py ADDED Viewed

	@@ -0,0 +1,140 @@

+import os
+import json
+import pandas as pd
+from pathlib import Path
+from datasets import Dataset, Features, Value, Sequence, Image as ImageFeature
def _resolve_image_paths(folder, image_paths):
    """Return *image_paths* joined onto *folder* as strings, skipping empty entries."""
    return [str(folder / img_path) for img_path in image_paths if img_path]


def _load_record(file_path):
    """Read one ``question.json`` file and return it as a flat record dict.

    Image paths listed under ``question_images`` / ``rationale_images`` are
    prefixed with the folder containing the JSON file, and the nested
    ``author_info`` mapping is flattened into ``author_<key>`` entries.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If the file is not valid JSON or does not hold a JSON object.
    """
    # Only keep the file open for as long as it takes to parse it.
    with open(file_path, "r", encoding="utf-8") as f:
        record = json.load(f)
    if not isinstance(record, dict):
        raise ValueError(
            f"expected a JSON object, got {type(record).__name__}"
        )

    folder = Path(file_path).parent
    for key in ("question_images", "rationale_images"):
        # A missing, null or empty value has no paths to rewrite; leave it
        # untouched instead of failing (and thereby dropping) the record.
        if record.get(key):
            record[key] = _resolve_image_paths(folder, record[key])

    # `author_info` may be absent or null; both are treated as "no author data".
    author_info = record.pop("author_info", None) or {}
    record.update({f"author_{k}": v for k, v in author_info.items()})
    return record


def process_and_push_dataset(
    data_dir: str, hub_repo: str, token: str, private: bool = True
):
    """
    Process local dataset files and push to Hugging Face Hub.

    Every ``question.json`` found under ``data_dir`` becomes one row. Files
    that cannot be read or parsed are reported on stdout and skipped. Fields
    a submission does not provide are filled with an empty string or an empty
    list, depending on the column.

    Args:
        data_dir (str): Path to the data directory containing submission folders
        hub_repo (str): Name of the Hugging Face repository to push to
        token (str): Token passed to ``Dataset.push_to_hub`` for authentication
        private (bool): Whether to make the pushed dataset private
    Returns:
        datasets.Dataset: The processed dataset
    """
    # Default value for every column the pushed dataset must contain.
    required_columns = {
        "custom_id": "",
        "author_name": "",
        "author_email_address": "",
        "author_institution": "",
        "question_categories": [],
        "question": "",
        "question_images": [],
        "final_answer": "",
        "rationale_text": "",
        "rationale_images": [],
        "image_attribution": "",
        "subquestions_1_text": "",
        "subquestions_1_answer": "",
        "subquestions_2_text": "",
        "subquestions_2_answer": "",
        "subquestions_3_text": "",
        "subquestions_3_answer": "",
        "subquestions_4_text": "",
        "subquestions_4_answer": "",
        "subquestions_5_text": "",
        "subquestions_5_answer": "",
    }

    all_records = []
    # Walk through all subdirectories in data_dir
    for root, _dirs, files in os.walk(data_dir):
        if "question.json" not in files:
            continue
        file_path = Path(root) / "question.json"
        try:
            record = _load_record(file_path)
        except Exception as e:
            # Deliberately best-effort: one broken submission must not
            # abort the whole export.
            print(f"Error processing {file_path}: {e}")
            continue
        # Fill defaults per record so a field missing from only some
        # submissions never surfaces as NaN in the DataFrame. Explicit
        # nulls are kept as they are. List defaults are copied so rows
        # never share one list object.
        for col, default_value in required_columns.items():
            if col not in record:
                record[col] = (
                    list(default_value)
                    if isinstance(default_value, list)
                    else default_value
                )
        all_records.append(record)

    # Convert to DataFrame
    df = pd.DataFrame(all_records)
    # Sort by custom_id for consistency
    if not df.empty and "custom_id" in df.columns:
        df = df.sort_values("custom_id")
    # With no records at all the frame has no columns yet. Assign one value
    # per row (zero rows here) because assigning a bare list would be
    # treated by pandas as the column's values and must match the row count.
    for col, default_value in required_columns.items():
        if col not in df.columns:
            df[col] = [
                list(default_value)
                if isinstance(default_value, list)
                else default_value
                for _ in range(len(df))
            ]

    # Define features
    features = Features(
        {
            "custom_id": Value("string"),
            "question": Value("string"),
            "question_images": Sequence(ImageFeature()),
            "question_categories": Sequence(Value("string")),
            "final_answer": Value("string"),
            "rationale_text": Value("string"),
            "rationale_images": Sequence(ImageFeature()),
            "image_attribution": Value("string"),
            "subquestions_1_text": Value("string"),
            "subquestions_1_answer": Value("string"),
            "subquestions_2_text": Value("string"),
            "subquestions_2_answer": Value("string"),
            "subquestions_3_text": Value("string"),
            "subquestions_3_answer": Value("string"),
            "subquestions_4_text": Value("string"),
            "subquestions_4_answer": Value("string"),
            "subquestions_5_text": Value("string"),
            "subquestions_5_answer": Value("string"),
            "author_name": Value("string"),
            "author_email_address": Value("string"),
            "author_institution": Value("string"),
        }
    )
    # Convert DataFrame to dict of lists (Hugging Face Dataset format);
    # columns not listed in `features` are dropped here.
    dataset_dict = {col: df[col].tolist() for col in features}
    # Create Dataset directly from dict
    dataset = Dataset.from_dict(dataset_dict, features=features)
    # Push to hub
    dataset.push_to_hub(hub_repo, private=private, max_shard_size="200MB", token=token)
    print("\nDataset Statistics:")
    print(f"Total number of submissions: {len(dataset)}")
    print(f"\nSuccessfully pushed dataset to {hub_repo}")
    return dataset