from datasets import get_dataset_config_names, load_dataset | |
from joblib.memory import Memory | |
# Memoizing decorator taken from a joblib Memory store located at ".cache";
# verbose=0 is passed to keep joblib's own logging quiet.
cache = Memory(
    location=".cache",
    verbose=0,
).cache
def _get_dataset_config_names(dataset, **kwargs):
    """Return whatever ``get_dataset_config_names`` yields for ``dataset``.

    This is a plain pass-through: ``dataset`` is handed on positionally and
    any extra keyword arguments are forwarded unchanged.
    """
    config_names = get_dataset_config_names(dataset, **kwargs)
    return config_names
def _load_dataset(dataset, subset, **kwargs):
    """Return the result of ``load_dataset(dataset, subset, **kwargs)``.

    This is a plain pass-through: both positional arguments and every extra
    keyword argument are forwarded unchanged.
    """
    loaded = load_dataset(dataset, subset, **kwargs)
    return loaded
# Cache individual dataset items to avoid reloading entire datasets
@cache
def _get_dataset_item(dataset, subset, split, index, **kwargs):
    """Load a single item from a dataset, memoized through the module's ``cache``.

    The call is wrapped with ``cache`` so that a repeated request for the same
    arguments does not go through ``load_dataset`` again.

    Args:
        dataset: Forwarded to ``load_dataset`` as its first argument.
        subset: Forwarded to ``load_dataset`` as its second argument.
        split: Forwarded as ``split=``. Presumably a single split name, so
            that the loaded object supports ``len()`` and integer indexing
            (TODO confirm against callers).
        index: Integer position of the wanted item. Negative values are
            passed through to the loaded dataset's own indexing.
        **kwargs: Forwarded unchanged to ``load_dataset``.

    Returns:
        The item at ``index``, or ``None`` when ``index`` lies outside the
        loaded dataset on either end.
    """
    ds = load_dataset(dataset, subset, split=split, **kwargs)
    size = len(ds)
    # Guard both ends: an index below -size is just as out of range as one
    # at or above size, and should yield None rather than raise.
    if -size <= index < size:
        return ds[index]
    return None