taesiri committed
Commit
02d2765
1 Parent(s): c2606ea
Files changed (1)
  1. utils.py +140 -0
utils.py ADDED
@@ -0,0 +1,140 @@
+ import os
+ import json
+ import pandas as pd
+ from pathlib import Path
+ from datasets import Dataset, Features, Value, Sequence, Image as ImageFeature
+
+
+ def process_and_push_dataset(
+     data_dir: str, hub_repo: str, token: str, private: bool = True
+ ):
+     """
+     Process local dataset files and push them to the Hugging Face Hub.
+
+     Args:
+         data_dir (str): Path to the data directory containing submission folders
+         hub_repo (str): Name of the Hugging Face repository to push to
+         token (str): Hugging Face access token used to authenticate the push
+         private (bool): Whether to make the pushed dataset private
+
+     Returns:
+         datasets.Dataset: The processed dataset
+     """
+     # List to store all records
+     all_records = []
+
+     # Walk through all subdirectories in data_dir
+     for root, dirs, files in os.walk(data_dir):
+         for file in files:
+             if file == "question.json":
+                 file_path = Path(root) / file
+                 try:
+                     # Read the JSON file
+                     with open(file_path, "r", encoding="utf-8") as f:
+                         record = json.load(f)
+
+                     # Get the folder path for this record
+                     folder_path = os.path.dirname(file_path)
+
+                     # Fix image paths to include the full path
+                     if "question_images" in record:
+                         record["question_images"] = [
+                             str(Path(folder_path) / img_path)
+                             for img_path in record["question_images"]
+                             if img_path
+                         ]
+
+                     if "rationale_images" in record:
+                         record["rationale_images"] = [
+                             str(Path(folder_path) / img_path)
+                             for img_path in record["rationale_images"]
+                             if img_path
+                         ]
+
+                     # Flatten the author_info dictionary into author_* columns
+                     author_info = record.pop("author_info", {})
+                     record.update(
+                         {f"author_{k}": v for k, v in author_info.items()}
+                     )
+
+                     # Add the record
+                     all_records.append(record)
+                 except Exception as e:
+                     print(f"Error processing {file_path}: {e}")
+
+     # Convert to DataFrame
+     df = pd.DataFrame(all_records)
+
+     # Sort by custom_id for consistency
+     if not df.empty and "custom_id" in df.columns:
+         df = df.sort_values("custom_id")
+
+     # Ensure all required columns exist with default values
+     required_columns = {
+         "custom_id": "",
+         "author_name": "",
+         "author_email_address": "",
+         "author_institution": "",
+         "question_categories": [],
+         "question": "",
+         "question_images": [],
+         "final_answer": "",
+         "rationale_text": "",
+         "rationale_images": [],
+         "image_attribution": "",
+         "subquestions_1_text": "",
+         "subquestions_1_answer": "",
+         "subquestions_2_text": "",
+         "subquestions_2_answer": "",
+         "subquestions_3_text": "",
+         "subquestions_3_answer": "",
+         "subquestions_4_text": "",
+         "subquestions_4_answer": "",
+         "subquestions_5_text": "",
+         "subquestions_5_answer": "",
+     }
+
+     for col, default_value in required_columns.items():
+         if col not in df.columns:
+             # Broadcast the default to every row; assigning a bare empty
+             # list (df[col] = []) raises a length-mismatch ValueError on a
+             # non-empty frame.
+             df[col] = [default_value] * len(df)
+
+     # Define features
+     features = Features(
+         {
+             "custom_id": Value("string"),
+             "question": Value("string"),
+             "question_images": Sequence(ImageFeature()),
+             "question_categories": Sequence(Value("string")),
+             "final_answer": Value("string"),
+             "rationale_text": Value("string"),
+             "rationale_images": Sequence(ImageFeature()),
+             "image_attribution": Value("string"),
+             "subquestions_1_text": Value("string"),
+             "subquestions_1_answer": Value("string"),
+             "subquestions_2_text": Value("string"),
+             "subquestions_2_answer": Value("string"),
+             "subquestions_3_text": Value("string"),
+             "subquestions_3_answer": Value("string"),
+             "subquestions_4_text": Value("string"),
+             "subquestions_4_answer": Value("string"),
+             "subquestions_5_text": Value("string"),
+             "subquestions_5_answer": Value("string"),
+             "author_name": Value("string"),
+             "author_email_address": Value("string"),
+             "author_institution": Value("string"),
+         }
+     )
+
+     # Convert DataFrame to a dict of lists (the format Dataset.from_dict expects)
+     dataset_dict = {col: df[col].tolist() for col in features.keys()}
+
+     # Create the Dataset directly from the dict
+     dataset = Dataset.from_dict(dataset_dict, features=features)
+
+     # Push to the Hub
+     dataset.push_to_hub(hub_repo, private=private, max_shard_size="200MB", token=token)
+
+     print("\nDataset Statistics:")
+     print(f"Total number of submissions: {len(dataset)}")
+     print(f"\nSuccessfully pushed dataset to {hub_repo}")
+
+     return dataset