Spaces:
Running
Running
import os | |
import json | |
import csv | |
import gradio as gr | |
import random | |
import time | |
from collections import Counter | |
from numpy.random import choice | |
from datasets import load_dataset, Dataset | |
from PIL import PngImagePlugin, ImageFile | |
# Let PIL load images whose file data is truncated instead of raising an error.
ImageFile.LOAD_TRUNCATED_IMAGES = True
# Allow PNG text chunks of up to 10 * 1048576 bytes (10 MiB), i.e. 10x what is
# presumably Pillow's default limit, so images with large embedded metadata
# can still be decoded.
PngImagePlugin.MAX_TEXT_CHUNK = 1048576 * 10
""" | |
This code is designed to read in a version of the ImageNet 1K ILSVRC dataset from the Hugging Face Hub
that has {percentage} lines with random labels based on the observed frequencies,
then create shuffled subsets of it, each holding a 1/N fraction of the examples (for every N in FRACTIONS),
then upload each subset to the Hugging Face Hub, in the Data Composition organization:
https://huggingface.co/datasets/datacomp
""" | |
# The number of examples/instances in this dataset is copied from the dataset card:
# https://huggingface.co/datasets/ILSVRC/imagenet-1k
NUM_EXAMPLES = 1281167
# Set to True to run on a tiny slice of the data and upload to debug repos.
DEV = False
# Denominators N of the 1/N fractions of the dataset to create.
FRACTIONS = [2, 4, 8, 16, 32, 64]
# Arbitrary small number of dataset examples to look at, only used when developing.
DEV_AMOUNT = 10
if DEV:
    # In dev mode the dataset is truncated, so size computations must use the
    # truncated length.
    NUM_EXAMPLES = DEV_AMOUNT
# Whether to read in the distribution over labels from an external text file.
READ_DISTRO = False
# Value of the GATED_IMAGENET environment variable (None when unset); it is
# passed as the `token` argument when loading from and pushing to the Hub.
GATED_IMAGENET = os.environ.get("GATED_IMAGENET")
def create_subset_dataset(dataset, fraction_size, num_examples=None):
    """Return a shuffled subset holding 1/``fraction_size`` of the examples.

    Args:
        dataset: Object exposing ``shuffle(buffer_size=...)`` and ``take(n)``,
            each returning a new dataset (the caller in this file passes a
            dataset loaded with ``streaming=True``).
        fraction_size: Positive denominator of the fraction to keep, e.g. 4
            keeps a quarter of the examples.
        num_examples: Total number of examples in ``dataset``. Defaults to the
            module-level ``NUM_EXAMPLES``.

    Returns:
        The result of ``dataset.shuffle(...).take(int(num_examples / fraction_size))``.

    Raises:
        ValueError: If ``fraction_size`` is not positive.
    """
    if num_examples is None:
        num_examples = NUM_EXAMPLES
    if fraction_size <= 0:
        # Fail with a clear message instead of a ZeroDivisionError or a
        # negative sample count further down.
        raise ValueError(
            "fraction_size must be positive, got %r" % (fraction_size,)
        )
    # The shuffle buffer spans the whole dataset so the sample is drawn from
    # all examples rather than only from a leading window.
    shuffled = dataset.shuffle(buffer_size=num_examples)
    num_samples = int(num_examples / fraction_size)
    return shuffled.take(num_samples)
def main(percentage=10):
    """Create fractional subsets of a dataset and upload them to the Hub.

    Streams the ``train`` split of ``datacomp/imagenet-1k-random<percentage>``
    and, for every ``frac`` in ``FRACTIONS``, pushes a shuffled subset built by
    ``create_subset_dataset`` to its own dataset repository. When ``DEV`` is
    set, only the first ``DEV_AMOUNT`` examples are used and the uploads go to
    ``debug-`` repositories.

    Args:
        percentage: Identifies which source dataset to read. It is converted
            with ``float`` before being embedded in repository names, so
            ``10`` and ``"10"`` both become ``"10.0"``.

    Returns:
        A message stating how many seconds the run took (also printed).
    """
    # Just for timing how long this takes.
    start = time.time()
    percentage = float(percentage)
    source_repo = "datacomp/imagenet-1k-random" + str(percentage)
    dataset = load_dataset(source_repo, split="train", streaming=True,
                           trust_remote_code=True, token=GATED_IMAGENET)
    if DEV:
        dataset = dataset.take(DEV_AMOUNT)
        target_prefix = "datacomp/debug-imagenet-1k-random-" + str(percentage)
    else:
        target_prefix = source_repo
    for frac in FRACTIONS:
        sampled_dataset = create_subset_dataset(dataset, frac)
        # Hub repo ids allow only one "/" (namespace/name) and no "--", so the
        # fraction is spelled "1over<frac>" rather than "1/<frac>".
        target_repo = target_prefix + "-frac-1over" + str(frac)
        # Upload the new version of the dataset (this will take a while).
        Dataset.from_generator(sampled_dataset.__iter__).push_to_hub(
            target_repo, token=GATED_IMAGENET)
    end = time.time()
    message = "That took %d seconds" % (end - start)
    print(message)
    # Returned so the Gradio text output shows something when the run ends.
    return message
# Expose main() through a text-in / text-out Gradio interface and start it.
demo = gr.Interface(fn=main, inputs="text", outputs="text")
demo.launch()