Spaces:
				
			
			
	
			
			
		Runtime error
		
	
	
	
			
			
	
	
	
	
		
		
		Runtime error
		
	File size: 1,117 Bytes
			
			| 5282eae | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 | import datasets
import os
from tqdm import tqdm
import webdataset as wds
import json
# Directory of the on-disk dataset handed to ``datasets.load_from_disk``.
DATASET_ROOT = "/gpfs/u/home/LMCG/LMCGljnn/scratch-shared/the_pile/all/train"
# Directory that receives the numbered ``%05d.tar`` shards.
OUT_DIR = "/gpfs/u/home/LMCG/LMCGljnn/scratch-shared/junyan/raw/the_pile"
# Maximum number of samples per shard (``maxcount`` of the shard writer).
SAMPLE_PER_SHARD = 100000
# Number of rows requested per batch when iterating the dataset.
BATCH_SIZE = 4096
# Samples whose ``meta["pile_set_name"]`` equals this value are not exported.
EXCLUDED_PILE_SET = "Github"


def is_excluded(meta):
    """Return True when *meta* marks a sample that must be skipped.

    A sample is skipped when its ``pile_set_name`` entry equals
    ``EXCLUDED_PILE_SET``; metadata without that key is always kept.
    """
    return meta.get("pile_set_name") == EXCLUDED_PILE_SET


def build_record(key, text, meta):
    """Build the shard record for one sample.

    Args:
        key: Running index of the sample; stored as its string form under
            ``__key__``.
        text: Sample text, stored UTF-8 encoded under ``txt``.
        meta: Sample metadata, stored as indented UTF-8 JSON under ``json``.

    Returns:
        A dict with the keys ``__key__``, ``txt`` and ``json``. The payloads
        are already bytes because the writer is created with
        ``encoder=False``.
    """
    return {
        "__key__": str(key),
        "txt": text.encode("utf-8"),
        "json": json.dumps(meta, indent=4).encode("utf-8"),
    }


def main():
    """Export every non-excluded sample of the dataset into tar shards."""
    # exist_ok=True lets an interrupted export be re-run; shards already
    # present in OUT_DIR are overwritten by the new run.
    os.makedirs(OUT_DIR, exist_ok=True)
    print("load dataset...")
    pile = datasets.load_from_disk(DATASET_ROOT)
    total_num = pile.num_rows
    print("total num:", total_num)

    num = 0
    shard_pattern = os.path.join(OUT_DIR, "%05d.tar")
    # Both the progress bar and the shard writer are context managers so
    # they are closed even when the export fails midway.
    with tqdm(total=total_num) as pbar:
        with wds.ShardWriter(
            shard_pattern, maxcount=SAMPLE_PER_SHARD, encoder=False
        ) as sink:
            for batch in pile.iter(BATCH_SIZE):
                for text, meta in zip(batch["text"], batch["meta"]):
                    # Progress counts every sample seen, including skipped ones.
                    pbar.update(1)
                    if is_excluded(meta):
                        continue
                    num += 1
                    sink.write(build_record(num, text, meta))
    print(f"{num} out of {total_num} is written")


if __name__ == "__main__":
    main()
 |