select-subset / app.py
mcurmei's picture
Update app.py
293f8f5 verified
raw
history blame
2.72 kB
import os
import json
import csv
import gradio as gr
import random
import time
from collections import Counter
from numpy.random import choice
from datasets import load_dataset, Dataset
from PIL import PngImagePlugin, ImageFile
ImageFile.LOAD_TRUNCATED_IMAGES = True
PngImagePlugin.MAX_TEXT_CHUNK = 1048576 * 10 # this is 10x the amount.
"""
This code is designed to read in the ImageNet 1K ILSVRC dataset from the Hugging Face Hub,
then create a new version of this dataset with {percentage} lines with random labels based on the observed frequencies,
then upload this new version of the Hugging Face Hub, in the Data Composition organization:
https://huggingface.co/datasets/datacomp
"""
# The number of examples/instances in this dataset is copied from the model card:
# https://huggingface.co/datasets/ILSVRC/imagenet-1k
NUM_EXAMPLES = 1281167
DEV = False
FRACTIONS = [2, 4, 8, 16, 32, 64]
# Arbitrary small number of dataset examples to look at, only using in devv'ing.
DEV_AMOUNT = 10
if DEV:
NUM_EXAMPLES = DEV_AMOUNT
# Whether to read in the distribution over labels from an external text file.
READ_DISTRO = False
GATED_IMAGENET = os.environ.get("GATED_IMAGENET")
def create_subset_dataset(dataset, fraction_size):
dataset = dataset.shuffle(buffer_size=NUM_EXAMPLES)
num_samples = int(NUM_EXAMPLES / fraction_size)
sampled_dataset = dataset.take(num_samples)
return sampled_dataset
def main(percentage=10):
global randomize_subset
# Just for timing how long this takes.
start = time.time()
percentage = float(percentage)
if DEV:
dataset = load_dataset("datacomp/imagenet-1k-random" + str(percentage), split="train", streaming=True,
trust_remote_code=True, token=GATED_IMAGENET).take(DEV_AMOUNT)
else:
dataset = load_dataset("datacomp/imagenet-1k-random" + str(percentage), split="train", streaming=True,
trust_remote_code=True, token=GATED_IMAGENET)
for frac in FRACTIONS:
sampled_dataset = create_subset_dataset(dataset, frac)
# Upload the new version of the dataset (this will take awhile)
if DEV:
Dataset.from_generator(updated_dataset.__iter__).push_to_hub(
"datacomp/debug-imagenet-1k-random-" + "-" + str(percentage) + '-frac-1/' + str(frac), token=GATED_IMAGENET)
else:
Dataset.from_generator(updated_dataset.__iter__).push_to_hub(
"datacomp/imagenet-1k-random" + str(percentage) + '-frac-1/' + str(frac), token=GATED_IMAGENET)
end = time.time()
print("That took %d seconds" % (end - start))
demo = gr.Interface(fn=main, inputs="text", outputs="text")
demo.launch()