select-subset

Running

App Files Files Community

meg HF Staff committed on Sep 27, 2024

Commit

66357ea

verified ·

1 Parent(s): 03e9604

Reading in "label_frequencies_full.csv"

Browse files

Files changed (1) hide show

app.py +21 -11

app.py CHANGED Viewed

@@ -25,21 +25,30 @@ NUM_EXAMPLES = 1281167
 # Arbitrary small number of dataset examples to look at; only used when developing.
 DEV = True
 DEV_AMOUNT = 10
 # Whether to read in the distribution over labels from an external text file.
 READ_DISTRO = False
 GATED_IMAGENET = os.environ.get("GATED_IMAGENET")
-def get_label_fractions(dataset):
     print("Getting label proportions.")
-    if READ_DISTRO:
-        with open("label_distro.json", "r+") as f:
-            label_counts = json.loads(f.read())
-    else:
-        label_counts = Counter([example['label'] for example in dataset])
-        # Don't overrwrite the distribution when devving.
-        if not DEV:
-            with open("label_distro.json", "w+") as f:
-                f.write(json.dumps(label_counts))
     label_list = list(label_counts.keys())
     denom = sum(label_counts.values())
     label_fractions = [label_counts[key]/denom for key in label_counts]
@@ -58,6 +67,7 @@ def randomize_labels(examples, indices, new_random_labels):
             examples["label"][n] = new_random_labels.pop() if index in batch_subset else examples["label"][n]
     return examples
 def main(percentage=10):
     global randomize_subset
     # Just for timing how long this takes.
@@ -77,7 +87,7 @@ def main(percentage=10):
         dataset = load_dataset("ILSVRC/imagenet-1k", split="train", streaming=True,
                            trust_remote_code=True, token=GATED_IMAGENET)
-    label_list, label_fractions = get_label_fractions(dataset)
     # How many new random labels are we creating?
     num_new_labels = int(round(NUM_EXAMPLES/float(percentage)))

 # Arbitrary small number of dataset examples to look at; only used when developing.
 DEV = True
 DEV_AMOUNT = 10
+if DEV:
+    NUM_EXAMPLES = DEV_AMOUNT
 # Whether to read in the distribution over labels from an external text file.
 READ_DISTRO = False
 GATED_IMAGENET = os.environ.get("GATED_IMAGENET")
+LABELS_FILE = "label_frequencies_full.csv"
+def read_label_frequencies():
+    label_counts_dict = {}
+    header_row = ['Label', 'Frequency']
+    with open(LABELS_FILE) as csvfile:
+        label_reader = csv.DictReader(csvfile)
+        assert label_reader.fieldnames == header_row
+        for row in label_reader:
+            assert row['Label'] not in label_counts_dict
+            label_counts_dict[row['Label']] = int(row['Frequency'])
+    # TODO: Can we just do this instead of the fractions? Do they really need to be normalized?
+    # label_list, label_counts = zip(*label_counts_dict.items())
+    return label_counts_dict
+def get_label_fractions(label_counts_dict):
     print("Getting label proportions.")
     label_list = list(label_counts.keys())
     denom = sum(label_counts.values())
     label_fractions = [label_counts[key]/denom for key in label_counts]
             examples["label"][n] = new_random_labels.pop() if index in batch_subset else examples["label"][n]
     return examples
 def main(percentage=10):
     global randomize_subset
     # Just for timing how long this takes.
         dataset = load_dataset("ILSVRC/imagenet-1k", split="train", streaming=True,
                            trust_remote_code=True, token=GATED_IMAGENET)
+    label_list, label_fractions = get_label_fractions(read_label_frequencies())
     # How many new random labels are we creating?
     num_new_labels = int(round(NUM_EXAMPLES/float(percentage)))