torinriley committed on
Commit
8468281
·
1 Parent(s): 787305e

data fix maybe

Browse files
Files changed (2) hide show
  1. Dockerfile +1 -3
  2. data/openwebtext/prepare.py +2 -0
Dockerfile CHANGED
@@ -23,9 +23,7 @@ RUN mkdir -p /app/out && chmod 777 /app/out
23
  COPY . .
24
 
25
  # Prepare the OpenWebText dataset
26
- RUN mkdir -p /app/data/openwebtext && \
27
- cd /app/data/openwebtext && \
28
- python prepare.py
29
 
30
  # Command to run training
31
  CMD ["python", "train.py", "--wandb_log=True"]
 
23
  COPY . .
24
 
25
  # Prepare the OpenWebText dataset
26
+ RUN cd /app/data/openwebtext && python prepare.py
 
 
27
 
28
  # Command to run training
29
  CMD ["python", "train.py", "--wandb_log=True"]
data/openwebtext/prepare.py CHANGED
@@ -19,6 +19,8 @@ num_proc_load_dataset = num_proc
19
  enc = tiktoken.get_encoding("gpt2")
20
 
21
  if __name__ == '__main__':
 
 
22
  # takes 54GB in huggingface .cache dir, about 8M documents (8,013,769)
23
  dataset = load_dataset("openwebtext", num_proc=num_proc_load_dataset)
24
 
 
19
  enc = tiktoken.get_encoding("gpt2")
20
 
21
  if __name__ == '__main__':
22
+ # Load dataset with trust_remote_code=True to allow custom code execution
23
+ dataset = load_dataset("openwebtext", trust_remote_code=True, num_proc=num_proc_load_dataset)
24
  # takes 54GB in huggingface .cache dir, about 8M documents (8,013,769)
25
  dataset = load_dataset("openwebtext", num_proc=num_proc_load_dataset)
26