File size: 2,723 Bytes
6a67483
3d85131
d1b1751
3d85131
832f54f
9a1c210
 
3d85131
 
 
9a1c210
3d85131
6a538fd
 
 
9a1c210
 
ba26a99
9a1c210
 
 
832f54f
9a1c210
 
 
ba26a99
293f8f5
3d85131
d46a843
66357ea
 
3d85131
 
6a67483
9a1c210
66357ea
293f8f5
 
 
 
 
 
9a1c210
 
 
 
 
8a4d47d
293f8f5
3096e07
293f8f5
434f8a6
3096e07
293f8f5
3096e07
9a1c210
293f8f5
 
9a1c210
293f8f5
 
 
 
 
 
 
3096e07
9a1c210
 
 
 
 
 
832f54f
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
import os
import json
import csv

import gradio as gr
import random
import time

from collections import Counter
from numpy.random import choice
from datasets import load_dataset, Dataset

from PIL import PngImagePlugin, ImageFile
# Pillow configuration applied once at import time.
# Let Pillow load image files whose data is truncated instead of raising
# (presumably some dataset images are incomplete -- TODO confirm).
ImageFile.LOAD_TRUNCATED_IMAGES = True
# Raise the PNG text-chunk size limit to 10 * 1048576, i.e. ten times the
# 1048576 base amount, so PNGs carrying large text metadata can be decoded.
PngImagePlugin.MAX_TEXT_CHUNK = 1048576 * 10 # this is 10x the amount.
"""
This code is designed to read in a label-randomized version of the ImageNet 1K ILSVRC dataset
(datacomp/imagenet-1k-random{percentage}) from the Hugging Face Hub,
then create smaller versions of this dataset, each holding a randomly sampled 1/2, 1/4, ..., 1/64 of the examples,
then upload each of these new versions to the Hugging Face Hub, in the Data Composition organization:
https://huggingface.co/datasets/datacomp
"""

# Development switch: when on, only a handful of examples are processed.
DEV = False
# Arbitrary small number of dataset examples to look at; only used while developing.
DEV_AMOUNT = 10
# The number of examples/instances in the full dataset is copied from the dataset card:
# https://huggingface.co/datasets/ILSVRC/imagenet-1k
# In development mode it is capped to DEV_AMOUNT instead.
NUM_EXAMPLES = DEV_AMOUNT if DEV else 1281167
# Denominators of the subsets to build: each entry f yields a subset with
# NUM_EXAMPLES / f examples.
FRACTIONS = [2, 4, 8, 16, 32, 64]
# Whether to read in the distribution over labels from an external text file.
READ_DISTRO = False
# Hugging Face Hub token, read from the environment (None when unset); it is
# passed as `token=` to the Hub calls in this file.
GATED_IMAGENET = os.environ.get("GATED_IMAGENET")


def create_subset_dataset(dataset, fraction_size):
    """Return a shuffled ``1 / fraction_size`` sample of ``dataset``.

    Args:
        dataset: Object exposing ``shuffle(buffer_size=...)`` and ``take(n)``
            (presumably a streaming ``datasets.IterableDataset`` -- the only
            caller in this file passes one).
        fraction_size: Denominator of the fraction to keep; the sample holds
            ``int(NUM_EXAMPLES / fraction_size)`` examples.

    Returns:
        The result of ``take`` on the shuffled dataset.
    """
    # The shuffle buffer spans NUM_EXAMPLES items, i.e. the whole dataset.
    shuffled = dataset.shuffle(buffer_size=NUM_EXAMPLES)
    subset_len = int(NUM_EXAMPLES / fraction_size)
    return shuffled.take(subset_len)
    
def main(percentage=10):
    """Build and upload fractional subsets of a label-randomized ImageNet-1k.

    Streams the train split of ``datacomp/imagenet-1k-random<percentage>``
    from the Hugging Face Hub. For every denominator in ``FRACTIONS`` it
    draws a shuffled sample via ``create_subset_dataset`` and pushes that
    sample to the Hub as its own dataset repository, named
    ``...random<percentage>-frac-1over<denominator>``.

    Args:
        percentage: Suffix selecting the source dataset. It is converted to
            ``float`` first, so ``10`` and ``"10"`` both select
            ``datacomp/imagenet-1k-random10.0``.

    Returns:
        A status string with the elapsed time in seconds (also printed), so
        that the Gradio text output has something to display.

    Raises:
        ValueError: If ``percentage`` cannot be converted to ``float``.
    """
    # Just for timing how long this takes.
    start = time.time()

    percentage = float(percentage)
    suffix = str(percentage)

    dataset = load_dataset("datacomp/imagenet-1k-random" + suffix, split="train",
                           streaming=True, trust_remote_code=True,
                           token=GATED_IMAGENET)
    if DEV:
        # Only look at a handful of examples while developing.
        dataset = dataset.take(DEV_AMOUNT)

    # Hub repo ids allow a single "/" (namespace/name) and no "--", so the
    # fraction is spelled "1over<frac>" rather than "1/<frac>", and the debug
    # prefix is joined to the percentage with a single dash.
    if DEV:
        target_prefix = "datacomp/debug-imagenet-1k-random-" + suffix
    else:
        target_prefix = "datacomp/imagenet-1k-random" + suffix

    for frac in FRACTIONS:
        sampled_dataset = create_subset_dataset(dataset, frac)

        # Upload the new version of the dataset (this will take awhile).
        # The streaming sample is materialized through its iterator first.
        Dataset.from_generator(sampled_dataset.__iter__).push_to_hub(
            target_prefix + "-frac-1over" + str(frac), token=GATED_IMAGENET)

    end = time.time()
    message = "That took %d seconds" % (end - start)
    print(message)
    return message


# Minimal Gradio front end: a single text box supplies `percentage` to `main`,
# and whatever `main` returns is rendered as text.
demo = gr.Interface(
    fn=main,
    inputs="text",
    outputs="text",
)
# Start serving the interface as soon as this module is executed.
demo.launch()