mcurmei committed on
Commit
293f8f5
·
verified ·
1 Parent(s): 87796e6

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +19 -67
app.py CHANGED
@@ -24,6 +24,7 @@ https://huggingface.co/datasets/datacomp
24
  # https://huggingface.co/datasets/ILSVRC/imagenet-1k
25
  NUM_EXAMPLES = 1281167
26
  DEV = False
 
27
  # Arbitrary small number of dataset examples to look at; only used when developing.
28
  DEV_AMOUNT = 10
29
  if DEV:
@@ -31,87 +32,38 @@ if DEV:
31
  # Whether to read in the distribution over labels from an external text file.
32
  READ_DISTRO = False
33
  GATED_IMAGENET = os.environ.get("GATED_IMAGENET")
34
- LABELS_FILE = "label_frequencies_full.csv"
35
-
36
-
37
- def read_label_frequencies():
38
- label_counts_dict = {}
39
- header_row = ['Label', 'Frequency']
40
- with open(LABELS_FILE) as csvfile:
41
- label_reader = csv.DictReader(csvfile)
42
- assert label_reader.fieldnames == header_row
43
- for row in label_reader:
44
- assert row['Label'] not in label_counts_dict
45
- label_counts_dict[row['Label']] = int(row['Frequency'])
46
- # TODO: Can we just do this instead of the fractions? Do they really need to be normalized?
47
- # label_list, label_counts = zip(*label_counts_dict.items())
48
- return label_counts_dict
49
-
50
-
51
- def get_label_fractions(label_counts_dict):
52
- print("Getting label proportions.")
53
- label_list = list(label_counts_dict.keys())
54
- denom = sum(label_counts_dict.values())
55
- label_fractions = [label_counts_dict[key]/denom for key in label_counts_dict]
56
- return label_list, label_fractions
57
-
58
-
59
- def randomize_labels(examples, indices, new_random_labels):
60
- # What set of examples should be randomized in this batch?
61
- # This is the intersection of the batch indices and the indices we randomly selected to change the labels of.
62
- batch_subset = list(set(indices) & randomize_subset)
63
- # If this batch has indices that we're changing the label of....
64
- if batch_subset != []:
65
- # Change the label to a random integer between 0 and 9
66
- for n in range(len(indices)):
67
- index = indices[n]
68
- examples["label"][n] = new_random_labels.pop() if index in batch_subset else examples["label"][n]
69
- return examples
70
 
71
 
 
 
 
 
 
 
72
  def main(percentage=10):
73
  global randomize_subset
74
  # Just for timing how long this takes.
75
  start = time.time()
76
 
77
  percentage = float(percentage)
78
- print("Randomizing %d percent of the data." % percentage)
79
- # Set the random seed, based on the percentage, so that our random changes are reproducible.
80
- random.seed(percentage)
81
-
82
- # Load the dataset from the HF hub. Use streaming so as not to load the entire dataset at once.
83
- # Use the .take(DEV_AMOUNT) to only grab a small chunk of instances to develop with.
84
  if DEV:
85
- dataset = load_dataset("ILSVRC/imagenet-1k", split="train", streaming=True,
86
  trust_remote_code=True, token=GATED_IMAGENET).take(DEV_AMOUNT)
87
  else:
88
- dataset = load_dataset("ILSVRC/imagenet-1k", split="train", streaming=True,
89
  trust_remote_code=True, token=GATED_IMAGENET)
90
 
91
- label_list, label_fractions = get_label_fractions(read_label_frequencies())
92
-
93
- # How many new random labels are we creating?
94
- num_new_labels = int(round(NUM_EXAMPLES * float(percentage) * .01))
95
-
96
- # Create a set of indices that are randomly chosen, to change their labels.
97
- # Specifically, randomly choose num_new_labels indices.
98
- randomize_subset = set(random.sample(range(0, NUM_EXAMPLES), num_new_labels))
99
-
100
- # Randomly choose what the new label values are, following the observed label frequencies.
101
- new_random_labels = list(choice(a=label_list, size=num_new_labels, p=label_fractions))
102
 
103
- # Update the dataset so that the labels are randomized
104
- updated_dataset = dataset.map(randomize_labels, with_indices=True,
105
- fn_kwargs={'new_random_labels':new_random_labels},
106
- features=dataset.features, batched=True)
107
-
108
- # Upload the new version of the dataset (this will take a while)
109
- if DEV:
110
- Dataset.from_generator(updated_dataset.__iter__).push_to_hub(
111
- "datacomp/imagenet-1k-random-debug" + str(DEV_AMOUNT) + "-" + str(percentage), token=GATED_IMAGENET)
112
- else:
113
- Dataset.from_generator(updated_dataset.__iter__).push_to_hub(
114
- "datacomp/imagenet-1k-random" + str(percentage), token=GATED_IMAGENET)
115
 
116
 
117
  end = time.time()
 
24
  # https://huggingface.co/datasets/ILSVRC/imagenet-1k
25
  NUM_EXAMPLES = 1281167
26
  DEV = False
27
+ FRACTIONS = [2, 4, 8, 16, 32, 64]
28
  # Arbitrary small number of dataset examples to look at; only used when developing.
29
  DEV_AMOUNT = 10
30
  if DEV:
 
32
  # Whether to read in the distribution over labels from an external text file.
33
  READ_DISTRO = False
34
  GATED_IMAGENET = os.environ.get("GATED_IMAGENET")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
35
 
36
 
37
+ def create_subset_dataset(dataset, fraction_size):
38
+ dataset = dataset.shuffle(buffer_size=NUM_EXAMPLES)
39
+ num_samples = int(NUM_EXAMPLES / fraction_size)
40
+ sampled_dataset = dataset.take(num_samples)
41
+ return sampled_dataset
42
+
43
  def main(percentage=10):
44
  global randomize_subset
45
  # Just for timing how long this takes.
46
  start = time.time()
47
 
48
  percentage = float(percentage)
49
+
 
 
 
 
 
50
  if DEV:
51
+ dataset = load_dataset("datacomp/imagenet-1k-random" + str(percentage), split="train", streaming=True,
52
  trust_remote_code=True, token=GATED_IMAGENET).take(DEV_AMOUNT)
53
  else:
54
+ dataset = load_dataset("datacomp/imagenet-1k-random" + str(percentage), split="train", streaming=True,
55
  trust_remote_code=True, token=GATED_IMAGENET)
56
 
57
+ for frac in FRACTIONS:
58
+ sampled_dataset = create_subset_dataset(dataset, frac)
 
 
 
 
 
 
 
 
 
59
 
60
+ # Upload the new version of the dataset (this will take a while)
61
+ if DEV:
62
+ Dataset.from_generator(updated_dataset.__iter__).push_to_hub(
63
+ "datacomp/debug-imagenet-1k-random-" + "-" + str(percentage) + '-frac-1/' + str(frac), token=GATED_IMAGENET)
64
+ else:
65
+ Dataset.from_generator(updated_dataset.__iter__).push_to_hub(
66
+ "datacomp/imagenet-1k-random" + str(percentage) + '-frac-1/' + str(frac), token=GATED_IMAGENET)
 
 
 
 
 
67
 
68
 
69
  end = time.time()