Commit edc7860
Parent(s): e5605f2
add streaming support
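This commit adds a `stream` flag to get_dataset(): when no local *.tar.gz shards are found, the loader falls back to streaming the shards from the usm3d/hoho-train-set dataset on the Hugging Face Hub through pipe:curl URLs authorized with the caller's token, and setup() now creates the data directory if it is missing. A minimal sketch of the intended call path (hypothetical session; the top-level import path and an accessible HF token are assumptions, not part of the diff):

    import hoho  # assumed import path for hoho/hoho.py

    hoho.setup()                                       # now also creates LOCAL_DATADIR if missing
    ds = hoho.get_dataset(split='train', stream=True)  # streams from the Hub if no local shards
    sample = next(iter(ds))                            # first decoded record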
hoho/hoho.py CHANGED (+30 -5)
@@ -3,8 +3,13 @@ import json
 import shutil
 from pathlib import Path
 from typing import Dict
+import warnings
 
 from PIL import ImageFile
+
+from huggingface_hub.utils._headers import build_hf_headers  # note: using _headers
+
+
 ImageFile.LOAD_TRUNCATED_IMAGES = True
 
 LOCAL_DATADIR = None
@@ -29,11 +34,11 @@ def setup(local_dir='./data/usm-training-data/data'):
     else:
         LOCAL_DATADIR = local_val_datadir
     print(f"Using {LOCAL_DATADIR} as the data directory (we are running locally)")
-
-    # os.system("ls -lahtr")
-    # os.system(f"ls -lahtr {LOCAL_DATADIR}")
 
-
+    if not LOCAL_DATADIR.exists():
+        warnings.warn(f"Data directory {LOCAL_DATADIR} does not exist: creating it...")
+        LOCAL_DATADIR.mkdir(parents=True)
+
     return LOCAL_DATADIR
 
 
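Aside on the directory creation added above: Path.mkdir(parents=True) raises FileExistsError if the directory appears between the exists() check and the call; mkdir(parents=True, exist_ok=True) would make the fallback idempotent and race-free. A standard-library-only sketch of that variant (path taken from setup()'s default argument):

    from pathlib import Path

    data_dir = Path('./data/usm-training-data/data')  # setup()'s default local_dir
    data_dir.mkdir(parents=True, exist_ok=True)       # no error if it already exists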
@@ -286,7 +291,9 @@ def get_params():
 import webdataset as wds
 import numpy as np
 
-def get_dataset(decode='pil', proc=proc, split='train', dataset_type='webdataset'):
+
+SHARD_IDS = {'train': (0, 25), 'val': (25, 26), 'public': (26, 27), 'private': (27, 32)}
+def get_dataset(decode='pil', proc=proc, split='train', dataset_type='webdataset', stream=True):
     if LOCAL_DATADIR is None:
         raise ValueError('LOCAL_DATADIR is not set. Please run setup() first.')
 
@@ -295,8 +302,24 @@ def get_dataset(decode='pil', proc=proc, split='train', dataset_type='webdataset
     local_dir = local_dir / split
 
     paths = [str(p) for p in local_dir.rglob('*.tar.gz')]
+    msg = f'no tarfiles found in {local_dir}.'
+    if len(paths) == 0:
+        if stream:
+            if split=='all': split = 'train'
+            warnings.warn('streaming isn\'t supported with \'all\': changing `split` to \'train\'')
+            warnings.warn(msg)
+            if split == 'val':
+                names = [f'data/val/inputs/hoho_v3_{i:03}-of-032.tar.gz' for i in range(*SHARD_IDS[split])]
+            elif split == 'train':
+                names = [f'data/train/hoho_v3_{i:03}-of-032.tar.gz' for i in range(*SHARD_IDS[split])]
+
+            auth = build_hf_headers()['authorization']
+            paths = [f"pipe:curl -L -s https://huggingface.co/datasets/usm3d/hoho-train-set/resolve/main/{name} -H 'Authorization: {auth}'" for name in names]
+        else:
+            raise FileNotFoundError(msg)
 
     dataset = wds.WebDataset(paths)
+
     if decode is not None:
         dataset = dataset.decode(decode)
     else:
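Note on the fallback introduced above: webdataset treats any path beginning with pipe: as a shell command whose stdout is read as the tar stream, so each shard is downloaded lazily by curl with the caller's Hub token. A standalone sketch of the same pattern (single hypothetical shard built from the train template above; assumes webdataset and huggingface_hub are installed and a token is cached):

    import webdataset as wds
    from huggingface_hub.utils._headers import build_hf_headers

    auth = build_hf_headers()['authorization']   # e.g. 'Bearer hf_...'
    url = ('pipe:curl -L -s '
           'https://huggingface.co/datasets/usm3d/hoho-train-set/resolve/main/'
           'data/train/hoho_v3_000-of-032.tar.gz '
           f"-H 'Authorization: {auth}'")
    ds = wds.WebDataset([url]).decode('pil')     # decode images to PIL lazily
    sample = next(iter(ds))                      # first record from the streamed shard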
@@ -315,6 +338,8 @@ def get_dataset(decode='pil', proc=proc, split='train', dataset_type='webdataset
         return datasets.IterableDataset.from_generator(lambda: dataset.iterator())
     elif split == 'val':
         return datasets.IterableDataset.from_generator(lambda: dataset.iterator())
+    else:
+        raise NotImplementedError('only train and val are implemented as hf datasets')
 
 
 
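For context on the new else branch: the train and val return paths wrap the webdataset iterator in datasets.IterableDataset.from_generator, which keeps the pipeline lazy instead of materializing it. A minimal sketch of that wrapping, with a toy generator standing in for the webdataset pipeline (assumes a recent datasets release that provides IterableDataset.from_generator):

    import datasets

    def toy_stream():
        # stand-in for `lambda: dataset.iterator()` in the diff
        for i in range(3):
            yield {'uid': i}

    ds = datasets.IterableDataset.from_generator(toy_stream)
    print(list(ds))  # [{'uid': 0}, {'uid': 1}, {'uid': 2}]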