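"""Convert the maze CSV splits from the Hugging Face hub into an on-disk
puzzle dataset: one NumPy array per field plus dataset.json metadata."""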
from typing import Optional

import csv
import json
import math
import os

import numpy as np
from argdantic import ArgParser
from huggingface_hub import hf_hub_download
from pydantic import BaseModel
from tqdm import tqdm

from common import PuzzleDatasetMetadata, dihedral_transform
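
# Characters that may appear in a maze grid; their position here determines the
# token id (offset by 1, since id 0 is reserved for PAD). Presumably: '#' wall,
# ' ' open cell, 'S' start, 'G' goal, 'o' solution path.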
CHARSET = "# SGo"

cli = ArgParser()


class DataProcessConfig(BaseModel):
    source_repo: str = "sapientinc/maze-30x30-hard-1k"  # HF dataset repo to download from
    output_dir: str = "data/maze-30x30-hard-1k"  # where the converted dataset is written

    subsample_size: Optional[int] = None  # optionally train on a random subset of this size
    aug: bool = False  # apply all 8 dihedral transforms to the training set


def convert_subset(set_name: str, config: DataProcessConfig):
    # Read the split's CSV from the hub. Each row is (source, question, answer,
    # rating); q and a are square mazes flattened into strings, which are
    # reshaped back assuming row-major order.
    all_chars = set()
    grid_size = None

    inputs = []
    labels = []
    with open(hf_hub_download(config.source_repo, f"{set_name}.csv", repo_type="dataset"), newline="") as csvfile:  # type: ignore
        reader = csv.reader(csvfile)
        next(reader)  # Skip header

        for source, q, a, rating in reader:
            all_chars.update(q)
            all_chars.update(a)
            if grid_size is None:
                # Grids are square: infer the side length from the flattened string.
                n = int(len(q) ** 0.5)
                grid_size = (n, n)

            inputs.append(np.frombuffer(q.encode(), dtype=np.uint8).reshape(grid_size))
            labels.append(np.frombuffer(a.encode(), dtype=np.uint8).reshape(grid_size))

    # If subsample_size is specified for the training set,
    # randomly sample the desired number of examples.
    if set_name == "train" and config.subsample_size is not None:
        total_samples = len(inputs)
        if config.subsample_size < total_samples:
            indices = np.random.choice(total_samples, size=config.subsample_size, replace=False)
            inputs = [inputs[i] for i in indices]
            labels = [labels[i] for i in indices]
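    # Note: the subsample above draws from NumPy's global RNG; seed it with
    # np.random.seed(...) beforehand if a reproducible subset is needed.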

    # Generate dataset
    results = {k: [] for k in ["inputs", "labels", "puzzle_identifiers", "puzzle_indices", "group_indices"]}
    puzzle_id = 0
    example_id = 0

    results["puzzle_indices"].append(0)
    results["group_indices"].append(0)
    for inp, out in zip(tqdm(inputs), labels):
        # Dihedral transformations for augmentation: every transformed maze
        # becomes its own single-example puzzle within the same group.
        for aug_idx in range(8 if (set_name == "train" and config.aug) else 1):
            results["inputs"].append(dihedral_transform(inp, aug_idx))
            results["labels"].append(dihedral_transform(out, aug_idx))
            example_id += 1
            puzzle_id += 1

            results["puzzle_indices"].append(example_id)
            results["puzzle_identifiers"].append(0)

        # Push group
        results["group_indices"].append(puzzle_id)

    # Char mappings: every character must be in CHARSET; ids start at 1 (0 = PAD)
    assert len(all_chars - set(CHARSET)) == 0, f"Unexpected characters: {all_chars - set(CHARSET)}"

    char2id = np.zeros(256, np.uint8)
    char2id[np.array(list(map(ord, CHARSET)))] = np.arange(len(CHARSET)) + 1
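    # i.e. '#' -> 1, ' ' -> 2, 'S' -> 3, 'G' -> 4, 'o' -> 5; any other byte maps to 0 (PAD)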

    # To Numpy: map each grid through char2id and flatten to one row per example
    def _seq_to_numpy(seq):
        return np.vstack([char2id[s.reshape(-1)] for s in seq])

    results = {
        "inputs": _seq_to_numpy(results["inputs"]),
        "labels": _seq_to_numpy(results["labels"]),

        "group_indices": np.array(results["group_indices"], dtype=np.int32),
        "puzzle_indices": np.array(results["puzzle_indices"], dtype=np.int32),
        "puzzle_identifiers": np.array(results["puzzle_identifiers"], dtype=np.int32),
    }

    # Metadata
    metadata = PuzzleDatasetMetadata(
        seq_len=int(math.prod(grid_size)),  # type: ignore
        vocab_size=len(CHARSET) + 1,  # PAD + charset

        pad_id=0,
        ignore_label_id=0,

        blank_identifier_id=0,
        num_puzzle_identifiers=1,

        total_groups=len(results["group_indices"]) - 1,
        mean_puzzle_examples=1,
        sets=["all"],
    )

    # Save metadata as JSON.
    save_dir = os.path.join(config.output_dir, set_name)
    os.makedirs(save_dir, exist_ok=True)

    with open(os.path.join(save_dir, "dataset.json"), "w") as f:
        json.dump(metadata.model_dump(), f)

    # Save data
    for k, v in results.items():
        np.save(os.path.join(save_dir, f"all__{k}.npy"), v)

    # Save IDs mapping (for visualization only)
    with open(os.path.join(config.output_dir, "identifiers.json"), "w") as f:
        json.dump(["<blank>"], f)


@cli.command(singleton=True)
def preprocess_data(config: DataProcessConfig):
    convert_subset("train", config)
    convert_subset("test", config)


if __name__ == "__main__":
    cli()
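
# Example invocation (a sketch: the script filename is assumed here, and
# argdantic derives the option names from DataProcessConfig's fields, so the
# exact flags may vary by version):
#   python build_maze_dataset.py --subsample-size 1000 --aug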