import os
import json
import pandas as pd
from pathlib import Path
from datasets import Dataset, Features, Value, Sequence, Image as ImageFeature
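
# Expected on-disk layout (an assumption inferred from the processing logic
# below, not a documented contract): each submission sits in its own folder
# under data_dir and provides a question.json plus any images it references,
# e.g. (folder and file names here are purely illustrative):
#
#   data/
#       submission_0001/
#           question.json   # custom_id, question, question_images,
#           diagram.png     # rationale_text, rationale_images, author_info, ...
#
# Image paths inside question.json are taken to be relative to the submission
# folder, and the nested author_info dict is flattened into author_* columns.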


def process_and_push_dataset(
    data_dir: str, hub_repo: str, token: str, private: bool = True
):
    """
    Process local dataset files and push them to the Hugging Face Hub.

    Args:
        data_dir (str): Path to the data directory containing submission folders
        hub_repo (str): Name of the Hugging Face repository to push to
        token (str): Hugging Face access token used for authentication
        private (bool): Whether to make the pushed dataset private

    Returns:
        datasets.Dataset: The processed dataset
    """
    # List to store all records
    all_records = []

    # Walk through all subdirectories in data_dir
    for root, dirs, files in os.walk(data_dir):
        for file in files:
            if file == "question.json":
                file_path = Path(root) / file
                try:
                    # Read the JSON file
                    with open(file_path, "r", encoding="utf-8") as f:
                        record = json.load(f)

                    # Get the folder path for this record
                    folder_path = os.path.dirname(file_path)

                    # Fix image paths to include full path
                    if "question_images" in record:
                        record["question_images"] = [
                            str(Path(folder_path) / img_path)
                            for img_path in record["question_images"]
                            if img_path
                        ]
                    if "rationale_images" in record:
                        record["rationale_images"] = [
                            str(Path(folder_path) / img_path)
                            for img_path in record["rationale_images"]
                            if img_path
                        ]

                    # Flatten author_info dictionary
                    author_info = record.pop("author_info", {})
                    record.update(
                        {f"author_{k}": v for k, v in author_info.items()}
                    )

                    # Add the record
                    all_records.append(record)
                except Exception as e:
                    print(f"Error processing {file_path}: {e}")

    # Convert to DataFrame
    df = pd.DataFrame(all_records)

    # Sort by custom_id for consistency
    if not df.empty and "custom_id" in df.columns:
        df = df.sort_values("custom_id")

    # Ensure all required columns exist with default values
    required_columns = {
        "custom_id": "",
        "author_name": "",
        "author_email_address": "",
        "author_institution": "",
        "question_categories": [],
        "question": "",
        "question_images": [],
        "final_answer": "",
        "rationale_text": "",
        "rationale_images": [],
        "image_attribution": "",
        "subquestions_1_text": "",
        "subquestions_1_answer": "",
        "subquestions_2_text": "",
        "subquestions_2_answer": "",
        "subquestions_3_text": "",
        "subquestions_3_answer": "",
        "subquestions_4_text": "",
        "subquestions_4_answer": "",
        "subquestions_5_text": "",
        "subquestions_5_answer": "",
    }
    for col, default_value in required_columns.items():
        if col not in df.columns:
            # Broadcast the default per row; assigning a bare [] to a
            # non-empty DataFrame would raise a length-mismatch error
            df[col] = [default_value] * len(df)

    # Define features
    features = Features(
        {
            "custom_id": Value("string"),
            "question": Value("string"),
            "question_images": Sequence(ImageFeature()),
            "question_categories": Sequence(Value("string")),
            "final_answer": Value("string"),
            "rationale_text": Value("string"),
            "rationale_images": Sequence(ImageFeature()),
            "image_attribution": Value("string"),
            "subquestions_1_text": Value("string"),
            "subquestions_1_answer": Value("string"),
            "subquestions_2_text": Value("string"),
            "subquestions_2_answer": Value("string"),
            "subquestions_3_text": Value("string"),
            "subquestions_3_answer": Value("string"),
            "subquestions_4_text": Value("string"),
            "subquestions_4_answer": Value("string"),
            "subquestions_5_text": Value("string"),
            "subquestions_5_answer": Value("string"),
            "author_name": Value("string"),
            "author_email_address": Value("string"),
            "author_institution": Value("string"),
        }
    )

    # Convert DataFrame to dict of lists (Hugging Face Dataset format)
    dataset_dict = {col: df[col].tolist() for col in features.keys()}

    # Create Dataset directly from dict
    dataset = Dataset.from_dict(dataset_dict, features=features)

    # Push to hub
    dataset.push_to_hub(hub_repo, private=private, max_shard_size="200MB", token=token)

    print("\nDataset Statistics:")
    print(f"Total number of submissions: {len(dataset)}")
    print(f"\nSuccessfully pushed dataset to {hub_repo}")

    return dataset
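

if __name__ == "__main__":
    # Minimal usage sketch, assuming the submissions live in ./data and the
    # access token is supplied via an HF_TOKEN environment variable; the
    # repository name below is a placeholder, not the actual target repo.
    process_and_push_dataset(
        data_dir="data",
        hub_repo="your-username/your-dataset",
        token=os.environ["HF_TOKEN"],
        private=True,
    )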