File size: 4,644 Bytes
6a67483
3d85131
 
832f54f
9a1c210
 
3d85131
 
 
9a1c210
3d85131
6a538fd
 
 
9a1c210
 
 
 
 
 
832f54f
9a1c210
 
 
3d85131
 
d46a843
3d85131
 
6a67483
9a1c210
31f2126
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9a1c210
 
 
 
 
 
8a4d47d
9a1c210
 
 
 
 
 
3096e07
 
434f8a6
3096e07
 
 
9a1c210
1e07ef1
3d85131
 
 
 
9a1c210
3d85131
 
 
 
2befb5b
9a1c210
 
 
 
 
 
3096e07
 
3d85131
3096e07
 
 
 
9a1c210
 
 
 
 
 
832f54f
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
import os
import json

import gradio as gr
import random
import time

from collections import Counter
from numpy.random import choice
from datasets import load_dataset, Dataset

from PIL import PngImagePlugin, ImageFile
# Pillow setting: allow truncated image files to be loaded rather than rejected.
ImageFile.LOAD_TRUNCATED_IMAGES = True
# Raise Pillow's cap on the size of a PNG text chunk to 10 * 1048576 bytes (10 MiB).
# NOTE(review): presumably this is 10x Pillow's default limit -- confirm against the Pillow docs.
PngImagePlugin.MAX_TEXT_CHUNK = 1048576 * 10
"""
This code is designed to read in the ImageNet 1K ILSVRC dataset from the Hugging Face Hub,
then create a new version of this dataset in which {percentage} percent of the examples have their label
replaced by a random label drawn from the observed label distribution,
then upload this new version to the Hugging Face Hub, in the Data Composition organization:
https://huggingface.co/datasets/datacomp
"""

# The number of examples/instances in this dataset, copied from the dataset card:
# https://huggingface.co/datasets/ILSVRC/imagenet-1k
NUM_EXAMPLES = 1281167
# Development switch. When True, main() only takes DEV_AMOUNT examples from the stream,
# uploads to a "-debug" repository, and get_label_fractions() leaves label_distro.json untouched.
DEV = True
# Arbitrary small number of dataset examples to look at; only used when developing.
DEV_AMOUNT = 10
# Whether to read the distribution over labels from an external file (label_distro.json)
# instead of recounting it from the dataset.
READ_DISTRO = False
# Value of the GATED_IMAGENET environment variable (None when unset); it is passed as
# `token` to both load_dataset() and push_to_hub().
GATED_IMAGENET = os.environ.get("GATED_IMAGENET")

def get_label_fractions(dataset):
    """Return the labels seen in the data and the fraction of examples carrying each.

    When READ_DISTRO is set, the label counts are loaded from ``label_distro.json``
    and ``dataset`` is not read at all. Otherwise the counts are computed by
    iterating over ``dataset`` once and, outside of DEV mode, cached to
    ``label_distro.json`` for later runs.

    Args:
        dataset: Iterable of examples, each supporting ``example['label']``.

    Returns:
        A ``(label_list, label_fractions)`` pair of equal-length lists, where
        ``label_fractions[i]`` is the share of all counted examples whose label
        is ``label_list[i]``.

    Raises:
        ZeroDivisionError: If labels are present but their counts sum to zero.
    """
    print("Getting label proportions.")
    if READ_DISTRO:
        # Read-only access is enough here; "r+" would needlessly require write permission.
        with open("label_distro.json", "r", encoding="utf-8") as f:
            stored_counts = json.load(f)
        # JSON object keys are always strings, so the integer labels written by
        # json.dump() below come back as "0", "1", ...  Convert them back so that
        # both branches yield the same label type.
        # NOTE(review): assumes labels are integers, as the rest of this file does.
        label_counts = {int(label): count for label, count in stored_counts.items()}
    else:
        label_counts = Counter(example['label'] for example in dataset)
        # Don't overwrite the distribution when devving.
        if not DEV:
            with open("label_distro.json", "w", encoding="utf-8") as f:
                json.dump(label_counts, f)
    label_list = list(label_counts)
    denom = sum(label_counts.values())
    label_fractions = [count / denom for count in label_counts.values()]
    return label_list, label_fractions


def randomize_labels(examples, indices):
    """Batched ``map`` callback that overwrites the labels of pre-selected examples.

    Relies on two module-level globals that must be populated before the first call:
    ``randomize_subset`` (set of dataset indices whose label is to be replaced) and
    ``new_random_labels`` (list of replacement labels; one is popped per replaced example).

    Args:
        examples: Batch as a dict of columns; ``examples["label"]`` is edited in place.
        indices: Dataset indices of the examples in this batch, in batch order.

    Returns:
        The same ``examples`` batch, with labels replaced where the index was selected.
    """
    # Check each index directly against the global set: O(1) per example, with no
    # per-batch intersection list to build and scan.
    for position, index in enumerate(indices):
        if index in randomize_subset:
            # Replace the label with the next pre-drawn random label.
            examples["label"][position] = new_random_labels.pop()
    return examples

def main(percentage=10):
    """Create and upload a copy of ImageNet-1k in which part of the labels are randomized.

    Streams the ``train`` split of ``ILSVRC/imagenet-1k`` (only DEV_AMOUNT examples
    when DEV is set), picks ``percentage`` percent of the NUM_EXAMPLES indices at
    random, replaces the labels at those indices with labels drawn according to
    the observed label frequencies, and pushes the result to the ``datacomp``
    organization on the Hugging Face Hub.

    Args:
        percentage: Share of examples to relabel, as a number (or numeric string,
            as delivered by the Gradio text box) between 0 and 100.

    Returns:
        A short message stating how long the run took, which is also printed.

    Raises:
        ValueError: If ``percentage`` is not a number or lies outside [0, 100].
    """
    # randomize_labels() looks both of these up as module-level globals, so they
    # have to be declared global here; a plain local would raise NameError there.
    global randomize_subset, new_random_labels
    # Just for timing how long this takes.
    start = time.time()

    percentage = float(percentage)
    # The comparison is also False for NaN, so that is rejected as well.
    if not 0 <= percentage <= 100:
        raise ValueError("percentage must be between 0 and 100, got %s" % percentage)
    print("Randomizing %d percent of the data." % percentage)
    # Set the random seed, based on the percentage, so that our random changes are reproducible.
    random.seed(percentage)

    # Load the dataset from the HF hub. Use streaming so as not to load the entire dataset at once.
    dataset = load_dataset("ILSVRC/imagenet-1k", split="train", streaming=True,
                           trust_remote_code=True, token=GATED_IMAGENET)
    if DEV:
        # Only grab a small chunk of instances to develop with.
        dataset = dataset.take(DEV_AMOUNT)

    label_list, label_fractions = get_label_fractions(dataset)

    # How many new random labels are we creating? `percentage` percent of all examples.
    num_new_labels = int(round(NUM_EXAMPLES * percentage / 100))

    # Create a set of indices that are randomly chosen, to change their labels.
    # Specifically, randomly choose num_new_labels indices.
    randomize_subset = set(random.sample(range(NUM_EXAMPLES), num_new_labels))

    # Randomly choose what the new label values are, following the observed label frequencies.
    # random.choices draws from the generator seeded above, so the labels are reproducible too.
    new_random_labels = random.choices(label_list, weights=label_fractions, k=num_new_labels)

    # Update the dataset so that the labels are randomized
    updated_dataset = dataset.map(randomize_labels, with_indices=True,
                                  features=dataset.features, batched=True)

    # Upload the new version of the dataset (this will take awhile)
    if DEV:
        repo_id = "datacomp/imagenet-1k-random-debug" + str(DEV_AMOUNT) + "-" + str(percentage)
    else:
        repo_id = "datacomp/imagenet-1k-random" + str(percentage)
    Dataset.from_generator(updated_dataset.__iter__).push_to_hub(repo_id, token=GATED_IMAGENET)

    end = time.time()
    summary = "That took %d seconds" % (end - start)
    print(summary)
    # Returned so the Gradio text output has something to display.
    return summary


# Wrap main() in a Gradio interface with a text input (passed to main as `percentage`)
# and a text output, then start the app. This runs whenever the module is executed or imported.
demo = gr.Interface(fn=main, inputs="text", outputs="text")
demo.launch()