meg HF Staff committed on
Commit
9a1c210
·
verified ·
1 Parent(s): 832f54f

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +62 -3
app.py CHANGED
@@ -1,7 +1,66 @@
1
  import gradio as gr
 
 
 
2
 
3
- def greet(name):
4
- return "Hello " + name + "!!"
 
 
 
 
5
 
6
- demo = gr.Interface(fn=greet, inputs="text", outputs="text")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7
  demo.launch()
 
1
  import gradio as gr
2
+ import random
3
+ import time
4
+ from datasets import load_dataset, Dataset
5
 
6
+ """
7
+ This code is designed to read in the ImageNet 1K ILSVRC dataset from the Hugging Face Hub,
8
+ then create a new version of this dataset with {percentage} lines with random labels between 0-9,
9
+ then upload this new version to the Hugging Face Hub, in the Data Composition organization:
10
+ https://huggingface.co/datasets/datacomp
11
+ """
12
 
13
+ # The number of examples/instances in this dataset is copied from the dataset card:
14
+ # https://huggingface.co/datasets/ILSVRC/imagenet-1k
15
+ NUM_EXAMPLES = 1281167
16
+ # Arbitrary small number, only used while developing (applied via .take(DEV_AMOUNT) below; remove that call to process the full dataset).
17
+ DEV_AMOUNT = 100
18
+
19
+
20
+ def main(percentage=10):
21
+ global randomize_subset
22
+ # Just for timing how long this takes.
23
+ start = time.time()
24
+
25
+ print("Randomizing %d percent of the data." % percentage)
26
+ # Set the random seed, based on the percentage, so that our random changes are reproducible.
27
+ random.seed(percentage)
28
+
29
+ # Load the dataset from the HF hub. Use streaming so as not to load the entire dataset at once.
30
+ # Use the .take(DEV_AMOUNT) to only grab a small chunk of instances to develop with.
31
+ dataset = load_dataset("ILSVRC/imagenet-1k", split="train", streaming=True,
32
+ trust_remote_code=True).take(DEV_AMOUNT)
33
+
34
+ # Create a set of indices that are randomly chosen, to change their labels.
35
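+ # Specifically, randomly choose round(NUM_EXAMPLES / percentage) indices. NOTE(review): this is 1/percentage of the examples, which equals `percentage` percent only when percentage == 10 -- confirm intent.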
36
+ randomize_subset = set(random.sample(range(0, NUM_EXAMPLES), round(
37
+ NUM_EXAMPLES / float(percentage))))
38
+
39
+ # Update the dataset so that the labels are randomized
40
+ updated_dataset = dataset.map(randomize_labels, with_indices=True,
41
+ features=dataset.features, batched=True)
42
+
43
+ # Upload the new version of the dataset (this will take a while)
44
+ Dataset.from_generator(updated_dataset.__iter__).push_to_hub(
45
+ "datacomp/imagenet-1k-random" + str(percentage))
46
+
47
+ end = time.time()
48
+ print("That took %d seconds" % (end - start))
49
+
50
+
51
+ def randomize_labels(examples, indices):
52
+ # What set of examples should be randomized in this batch?
53
+ # This is the intersection of the batch indices and the indices we randomly selected to change the labels of.
54
+ batch_subset = list(set(indices) & randomize_subset)
55
+ # If this batch has indices that we're changing the label of....
56
+ if batch_subset != []:
57
+ # Change the label to a random integer between 0 and 9
58
+ for n in range(len(indices)):
59
+ index = indices[n]
60
+ examples["label"][n] = random.randint(0,
61
+ 9) if index in batch_subset else \
62
+ examples["label"][n]
63
+ return examples
64
+
65
+ demo = gr.Interface(fn=main, inputs="text", outputs="text")
66
  demo.launch()