Spaces:
Running
Running
File size: 2,723 Bytes
6a67483 3d85131 d1b1751 3d85131 832f54f 9a1c210 3d85131 9a1c210 3d85131 6a538fd 9a1c210 ba26a99 9a1c210 832f54f 9a1c210 ba26a99 293f8f5 3d85131 d46a843 66357ea 3d85131 6a67483 9a1c210 66357ea 293f8f5 9a1c210 8a4d47d 293f8f5 3096e07 293f8f5 434f8a6 3096e07 293f8f5 3096e07 9a1c210 293f8f5 9a1c210 293f8f5 3096e07 9a1c210 832f54f |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 |
import os
import json
import csv
import gradio as gr
import random
import time
from collections import Counter
from numpy.random import choice
from datasets import load_dataset, Dataset
from PIL import PngImagePlugin, ImageFile
# Let Pillow load image files whose data is truncated instead of raising an error.
ImageFile.LOAD_TRUNCATED_IMAGES = True
# Raise Pillow's limit on the size of a PNG text chunk to 10 MiB.
PngImagePlugin.MAX_TEXT_CHUNK = 1048576 * 10 # this is 10x the amount.
"""
This code is designed to read in the ImageNet 1K ILSVRC dataset from the Hugging Face Hub,
then create a new version of this dataset with {percentage} lines with random labels based on the observed frequencies,
then upload this new version to the Hugging Face Hub, in the Data Composition organization:
https://huggingface.co/datasets/datacomp
"""
# Development switch: when True, only a handful of examples are processed.
DEV = False
# Arbitrary small number of dataset examples to look at; only used while developing.
DEV_AMOUNT = 10
# The full example count is copied from the dataset card:
# https://huggingface.co/datasets/ILSVRC/imagenet-1k
# In development mode it is replaced by DEV_AMOUNT.
NUM_EXAMPLES = DEV_AMOUNT if DEV else 1281167
# Divisors for the subset sizes: each value f yields a subset of NUM_EXAMPLES / f examples.
FRACTIONS = [2, 4, 8, 16, 32, 64]
# Whether to read in the distribution over labels from an external text file.
READ_DISTRO = False
# Token passed to the Hugging Face Hub calls; None when the variable is unset.
GATED_IMAGENET = os.environ.get("GATED_IMAGENET")
def create_subset_dataset(dataset, fraction_size):
    """Return a shuffled subset holding NUM_EXAMPLES / fraction_size examples.

    Args:
        dataset: Object providing ``shuffle(buffer_size=...)`` and ``take(n)``;
            ``main`` passes the streaming dataset it loaded from the Hub.
        fraction_size: Divisor applied to ``NUM_EXAMPLES``; the quotient is
            truncated to an integer to get the number of examples to keep.

    Returns:
        Whatever ``take`` returns on the shuffled dataset, i.e. the subset.
    """
    # The shuffle buffer is sized to NUM_EXAMPLES, the full example count.
    shuffled = dataset.shuffle(buffer_size=NUM_EXAMPLES)
    return shuffled.take(int(NUM_EXAMPLES / fraction_size))
def main(percentage=10):
    """Create fractional subsets of a random-label ImageNet-1k dataset and upload them.

    Streams the ``train`` split of ``datacomp/imagenet-1k-random<percentage>``
    from the Hugging Face Hub. For every divisor in ``FRACTIONS`` it draws a
    shuffled subset with ``create_subset_dataset`` and pushes that subset to
    the Hub as its own dataset repository. When ``DEV`` is set, only the first
    ``DEV_AMOUNT`` examples are used and the uploads go to ``debug-`` repos.

    Args:
        percentage: Identifies which source dataset to read. It is converted
            to ``float`` before being formatted into repository names, so
            ``10`` and ``"10"`` both become ``10.0``.

    Returns:
        A status string reporting the elapsed time in seconds (also printed).

    Raises:
        ValueError: If ``percentage`` cannot be converted to ``float``.
    """
    # Just for timing how long this takes.
    start = time.time()
    percentage = float(percentage)
    source_repo = "datacomp/imagenet-1k-random" + str(percentage)
    dataset = load_dataset(source_repo, split="train", streaming=True,
                           trust_remote_code=True, token=GATED_IMAGENET)
    if DEV:
        # Keep development runs small.
        dataset = dataset.take(DEV_AMOUNT)
    for frac in FRACTIONS:
        sampled_dataset = create_subset_dataset(dataset, frac)
        # Hub repo ids allow a single "/" (namespace/name) and no "--", so the
        # fraction is spelled "1over<frac>" rather than "1/<frac>".
        if DEV:
            target_repo = ("datacomp/debug-imagenet-1k-random-" + str(percentage)
                           + "-frac-1over" + str(frac))
        else:
            target_repo = ("datacomp/imagenet-1k-random" + str(percentage)
                           + "-frac-1over" + str(frac))
        # Upload the new version of the dataset (this will take awhile).
        # The streamed subset is materialised through its iterator first.
        Dataset.from_generator(sampled_dataset.__iter__).push_to_hub(
            target_repo, token=GATED_IMAGENET)
    end = time.time()
    message = "That took %d seconds" % (end - start)
    print(message)
    # Returned so the interface's text output has something to show.
    return message
# Gradio interface: one text input (passed to main as `percentage`) and one text output.
demo = gr.Interface(fn=main, inputs="text", outputs="text")
# Start serving the interface.
demo.launch()
|