import os
import json
import pandas as pd
from pathlib import Path
from datasets import Dataset, Features, Value, Sequence, Image as ImageFeature


def process_and_push_dataset(
    data_dir: str, hub_repo: str, token: str, private: bool = True
):
    """
    Process local dataset files and push to Hugging Face Hub.

    Args:
        data_dir (str): Path to the data directory containing submission folders
        hub_repo (str): Name of the Hugging Face repository to push to
        token (str): Hugging Face API token used to authenticate the push
        private (bool): Whether to make the pushed dataset private

    Returns:
        datasets.Dataset: The processed dataset
    """
    # List to store all records
    all_records = []

    # Walk through all subdirectories in data_dir
    for root, dirs, files in os.walk(data_dir):
        for file in files:
            if file == "question.json":
                file_path = Path(root) / file
                try:
                    # Read the JSON file
                    with open(file_path, "r", encoding="utf-8") as f:
                        record = json.load(f)

                    # Get the folder path for this record
                    folder_path = os.path.dirname(file_path)

                    # Fix image paths to include full path
                    if "question_images" in record:
                        record["question_images"] = [
                            str(Path(folder_path) / img_path)
                            for img_path in record["question_images"]
                            if img_path
                        ]
                    if "rationale_images" in record:
                        record["rationale_images"] = [
                            str(Path(folder_path) / img_path)
                            for img_path in record["rationale_images"]
                            if img_path
                        ]

                    # Flatten author_info dictionary
                    author_info = record.pop("author_info", {})
                    record.update(
                        {f"author_{k}": v for k, v in author_info.items()}
                    )

                    # Add the record
                    all_records.append(record)
                except Exception as e:
                    print(f"Error processing {file_path}: {e}")

    # Convert to DataFrame
    df = pd.DataFrame(all_records)

    # Sort by custom_id for consistency
    if not df.empty and "custom_id" in df.columns:
        df = df.sort_values("custom_id")

    # Ensure all required columns exist with default values
    required_columns = {
        "custom_id": "",
        "author_name": "",
        "author_email_address": "",
        "author_institution": "",
        "question_categories": [],
        "question": "",
        "question_images": [],
        "final_answer": "",
        "rationale_text": "",
        "rationale_images": [],
        "image_attribution": "",
        "subquestions_1_text": "",
        "subquestions_1_answer": "",
        "subquestions_2_text": "",
        "subquestions_2_answer": "",
        "subquestions_3_text": "",
        "subquestions_3_answer": "",
        "subquestions_4_text": "",
        "subquestions_4_answer": "",
        "subquestions_5_text": "",
        "subquestions_5_answer": "",
    }
    for col, default_value in required_columns.items():
        if col not in df.columns:
            # Broadcast the default to every row so list defaults (e.g. []) also work
            df[col] = [default_value] * len(df)

    # Define features
    features = Features(
        {
            "custom_id": Value("string"),
            "question": Value("string"),
            "question_images": Sequence(ImageFeature()),
            "question_categories": Sequence(Value("string")),
            "final_answer": Value("string"),
            "rationale_text": Value("string"),
            "rationale_images": Sequence(ImageFeature()),
            "image_attribution": Value("string"),
            "subquestions_1_text": Value("string"),
            "subquestions_1_answer": Value("string"),
            "subquestions_2_text": Value("string"),
            "subquestions_2_answer": Value("string"),
            "subquestions_3_text": Value("string"),
            "subquestions_3_answer": Value("string"),
            "subquestions_4_text": Value("string"),
            "subquestions_4_answer": Value("string"),
            "subquestions_5_text": Value("string"),
            "subquestions_5_answer": Value("string"),
            "author_name": Value("string"),
            "author_email_address": Value("string"),
            "author_institution": Value("string"),
        }
    )

    # Convert DataFrame to dict of lists (Hugging Face Dataset format)
    dataset_dict = {col: df[col].tolist() for col in features.keys()}

    # Create Dataset directly from dict
    dataset = Dataset.from_dict(dataset_dict, features=features)

    # Push to hub
    dataset.push_to_hub(hub_repo, private=private, max_shard_size="200MB", token=token)

    print("\nDataset Statistics:")
    print(f"Total number of submissions: {len(dataset)}")
    print(f"\nSuccessfully pushed dataset to {hub_repo}")

    return dataset
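

# Illustrative usage sketch (not part of the original script). The data
# directory, repo id, and HF_TOKEN environment variable below are placeholder
# assumptions; substitute your own values before running.
if __name__ == "__main__":
    hub_token = os.environ.get("HF_TOKEN", "")  # hypothetical: Hub token read from the environment
    process_and_push_dataset(
        data_dir="data",                        # hypothetical: one subfolder per submission, each with a question.json
        hub_repo="your-username/your-dataset",  # hypothetical: target Hub repo id
        token=hub_token,
        private=True,
    )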