import os
import json
import pandas as pd
from pathlib import Path
from datasets import Dataset, Features, Value, Sequence, Image as ImageFeature


def process_and_push_dataset(
    data_dir: str, hub_repo: str, token: str, private: bool = True
):
    """
    Process local dataset files and push to Hugging Face Hub.

    Args:
        data_dir (str): Path to the data directory containing submission folders
        hub_repo (str): Name of the Hugging Face repository to push to
        token (str): Hugging Face access token used to authenticate the push
        private (bool): Whether to make the pushed dataset private

    Returns:
        datasets.Dataset: The processed dataset
    """
    # List to store all records
    all_records = []

    # Walk through all subdirectories in data_dir
    for root, dirs, files in os.walk(data_dir):
        for file in files:
            if file == "question.json":
                file_path = Path(root) / file
                try:
                    # Read the JSON file
                    with open(file_path, "r", encoding="utf-8") as f:
                        record = json.load(f)

                    # Get the folder path for this record
                    folder_path = os.path.dirname(file_path)

                    # Fix image paths to include full path
                    if "question_images" in record:
                        record["question_images"] = [
                            str(Path(folder_path) / img_path)
                            for img_path in record["question_images"]
                            if img_path
                        ]
                    if "rationale_images" in record:
                        record["rationale_images"] = [
                            str(Path(folder_path) / img_path)
                            for img_path in record["rationale_images"]
                            if img_path
                        ]

                    # Flatten author_info dictionary
                    author_info = record.pop("author_info", {})
                    record.update(
                        {f"author_{k}": v for k, v in author_info.items()}
                    )

                    # Add the record
                    all_records.append(record)
                except Exception as e:
                    print(f"Error processing {file_path}: {e}")
    # Convert to DataFrame
    df = pd.DataFrame(all_records)

    # Sort by custom_id for consistency
    if not df.empty and "custom_id" in df.columns:
        df = df.sort_values("custom_id")

    # Ensure all required columns exist with default values
    required_columns = {
        "custom_id": "",
        "author_name": "",
        "author_email_address": "",
        "author_institution": "",
        "question_categories": [],
        "question": "",
        "question_images": [],
        "final_answer": "",
        "rationale_text": "",
        "rationale_images": [],
        "image_attribution": "",
        "subquestions_1_text": "",
        "subquestions_1_answer": "",
        "subquestions_2_text": "",
        "subquestions_2_answer": "",
        "subquestions_3_text": "",
        "subquestions_3_answer": "",
        "subquestions_4_text": "",
        "subquestions_4_answer": "",
        "subquestions_5_text": "",
        "subquestions_5_answer": "",
    }
    for col, default_value in required_columns.items():
        if col not in df.columns:
            # Scalar defaults broadcast across rows; list defaults must be
            # expanded to one empty list per row, otherwise pandas raises a
            # length-mismatch error on a non-empty DataFrame
            if isinstance(default_value, list):
                df[col] = [list(default_value) for _ in range(len(df))]
            else:
                df[col] = default_value
    # Define features
    features = Features(
        {
            "custom_id": Value("string"),
            "question": Value("string"),
            "question_images": Sequence(ImageFeature()),
            "question_categories": Sequence(Value("string")),
            "final_answer": Value("string"),
            "rationale_text": Value("string"),
            "rationale_images": Sequence(ImageFeature()),
            "image_attribution": Value("string"),
            "subquestions_1_text": Value("string"),
            "subquestions_1_answer": Value("string"),
            "subquestions_2_text": Value("string"),
            "subquestions_2_answer": Value("string"),
            "subquestions_3_text": Value("string"),
            "subquestions_3_answer": Value("string"),
            "subquestions_4_text": Value("string"),
            "subquestions_4_answer": Value("string"),
            "subquestions_5_text": Value("string"),
            "subquestions_5_answer": Value("string"),
            "author_name": Value("string"),
            "author_email_address": Value("string"),
            "author_institution": Value("string"),
        }
    )
    # Convert DataFrame to dict of lists (Hugging Face Dataset format)
    dataset_dict = {col: df[col].tolist() for col in features.keys()}

    # Create Dataset directly from dict
    dataset = Dataset.from_dict(dataset_dict, features=features)

    # Push to hub
    dataset.push_to_hub(hub_repo, private=private, max_shard_size="200MB", token=token)

    print("\nDataset Statistics:")
    print(f"Total number of submissions: {len(dataset)}")
    print(f"\nSuccessfully pushed dataset to {hub_repo}")
    return dataset
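

# A minimal usage sketch. The directory name, repository id, and environment
# variable below are placeholders, not values defined by this script: it assumes
# each submission folder under ./data contains a question.json plus the image
# files it references, and that HF_TOKEN holds a token with write access.
if __name__ == "__main__":
    hf_token = os.environ.get("HF_TOKEN")
    if not hf_token:
        raise SystemExit("Set HF_TOKEN to a Hugging Face token with write access.")
    process_and_push_dataset(
        data_dir="data",                  # assumed layout: one subfolder per submission
        hub_repo="my-org/my-qa-dataset",  # placeholder repository id
        token=hf_token,
        private=True,
    )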