select-subset

Running

File size: 2,723 Bytes

import os
import json
import csv

import gradio as gr
import random
import time

from collections import Counter
from numpy.random import choice
from datasets import load_dataset, Dataset

from PIL import PngImagePlugin, ImageFile
# Ask PIL to load image files that are truncated rather than failing on them.
ImageFile.LOAD_TRUNCATED_IMAGES = True
# Raise the size limit PIL applies to PNG text chunks to 10 * 1048576 bytes (10 MiB).
PngImagePlugin.MAX_TEXT_CHUNK = 1048576 * 10  # presumably 10x Pillow's default limit -- confirm
"""
This code is designed to read in a version of the ImageNet 1K ILSVRC dataset from the Hugging Face Hub
in which {percentage} percent of the lines have random labels based on the observed frequencies,
then create randomly sampled subsets of that version (1/2, 1/4, ..., 1/64 of the examples),
then upload these subsets to the Hugging Face Hub, in the Data Composition organization:
https://huggingface.co/datasets/datacomp
"""

# Development switch: when True, only a handful of examples are processed.
DEV = False
# Arbitrary small number of dataset examples to look at; only used while developing.
DEV_AMOUNT = 10
# The number of examples/instances in this dataset is copied from the model card:
# https://huggingface.co/datasets/ILSVRC/imagenet-1k
# In DEV mode the small DEV_AMOUNT is used instead.
NUM_EXAMPLES = DEV_AMOUNT if DEV else 1281167
# Denominators of the subsets to build: a value f yields a subset of 1/f of the examples.
FRACTIONS = [2, 4, 8, 16, 32, 64]
# Whether to read in the distribution over labels from an external text file.
READ_DISTRO = False
# Hugging Face access token, taken from the GATED_IMAGENET environment variable
# (None when the variable is not set).
GATED_IMAGENET = os.environ.get("GATED_IMAGENET")


def create_subset_dataset(dataset, fraction_size, num_examples=None):
    """Return a randomly sampled subset holding 1/fraction_size of the dataset.

    The dataset is shuffled with a buffer that spans `num_examples` items and
    the first `num_examples // fraction_size` items of the shuffled result
    are kept.

    Args:
        dataset: Object exposing `shuffle(buffer_size=...)` and `take(n)`
            (presumably a streaming `datasets.IterableDataset`, since the
            caller loads it with `streaming=True`).
        fraction_size: Denominator of the fraction to keep, e.g. 4 keeps a
            quarter of the examples.
        num_examples: Total number of examples in `dataset`. Defaults to the
            module-level NUM_EXAMPLES, which matches the previous behaviour.

    Returns:
        Whatever `dataset.shuffle(...).take(...)` returns.
    """
    if num_examples is None:
        num_examples = NUM_EXAMPLES
    shuffled = dataset.shuffle(buffer_size=num_examples)
    # Floor division avoids the float round-trip of int(a / b); the outer int()
    # keeps the count an int even if a float denominator is passed in.
    num_samples = int(num_examples // fraction_size)
    return shuffled.take(num_samples)
    
def _subset_repo_id(percentage, frac, dev=False):
    """Build the Hub repo id under which the 1/frac subset is uploaded."""
    if dev:
        prefix = "datacomp/debug-imagenet-1k-random-"
    else:
        prefix = "datacomp/imagenet-1k-random"
    # A Hub repo id is "namespace/name": a second "/" (as in "1/2") or a "--"
    # is rejected, so the fraction is spelled "1over<frac>".
    return prefix + str(percentage) + "-frac-1over" + str(frac)


def main(percentage=10):
    """Create and upload fractional subsets of a random-label ImageNet dataset.

    Streams the train split of "datacomp/imagenet-1k-random<percentage>" from
    the Hugging Face Hub and, for every denominator in FRACTIONS, samples a
    random subset with create_subset_dataset() and pushes it to the Hub.

    Args:
        percentage: Percentage identifying the source dataset. Anything
            accepted by float() works (the Gradio text box passes a string);
            the float's string form, e.g. "10.0", is used in the repo names.

    Returns:
        A short message stating how many seconds the run took. The same
        message is also printed.

    Raises:
        ValueError: If `percentage` cannot be converted to a float.
    """
    # Just for timing how long this takes.
    start = time.time()

    percentage = float(percentage)

    dataset = load_dataset(
        "datacomp/imagenet-1k-random" + str(percentage),
        split="train",
        streaming=True,
        trust_remote_code=True,
        token=GATED_IMAGENET,
    )
    if DEV:
        # Only look at a handful of examples while developing.
        dataset = dataset.take(DEV_AMOUNT)

    for frac in FRACTIONS:
        sampled_dataset = create_subset_dataset(dataset, frac)

        # Upload the new version of the dataset (this will take awhile).
        # The streamed subset is materialised through its iterator.
        Dataset.from_generator(sampled_dataset.__iter__).push_to_hub(
            _subset_repo_id(percentage, frac, dev=DEV), token=GATED_IMAGENET)

    end = time.time()
    message = "That took %d seconds" % (end - start)
    print(message)
    # Returned so the Gradio text output has something to display.
    return message


# Gradio UI: the contents of a single text input are passed to main() as
# `percentage`, and whatever main() returns is rendered as text output.
demo = gr.Interface(fn=main, inputs="text", outputs="text")
# Launches the interface as soon as this module is executed.
demo.launch()