Spaces:
Build error
Build error
Commit
·
8468281
1
Parent(s):
787305e
dtata fox maybe
Browse files
- Dockerfile +1 -3
- data/openwebtext/prepare.py +2 -0
Dockerfile
CHANGED
@@ -23,9 +23,7 @@ RUN mkdir -p /app/out && chmod 777 /app/out
|
|
23 |
COPY . .
|
24 |
|
25 |
# Prepare the OpenWebText dataset
|
26 |
-
RUN
|
27 |
-
cd /app/data/openwebtext && \
|
28 |
-
python prepare.py
|
29 |
|
30 |
# Command to run training
|
31 |
CMD ["python", "train.py", "--wandb_log=True"]
|
|
|
23 |
COPY . .
|
24 |
|
25 |
# Prepare the OpenWebText dataset
|
26 |
+
RUN cd /app/data/openwebtext && python prepare.py
|
|
|
|
|
27 |
|
28 |
# Command to run training
|
29 |
CMD ["python", "train.py", "--wandb_log=True"]
|
data/openwebtext/prepare.py
CHANGED
@@ -19,6 +19,8 @@ num_proc_load_dataset = num_proc
|
|
19 |
enc = tiktoken.get_encoding("gpt2")
|
20 |
|
21 |
if __name__ == '__main__':
|
|
|
|
|
22 |
# takes 54GB in huggingface .cache dir, about 8M documents (8,013,769)
|
23 |
dataset = load_dataset("openwebtext", num_proc=num_proc_load_dataset)
|
24 |
|
|
|
19 |
enc = tiktoken.get_encoding("gpt2")
|
20 |
|
21 |
if __name__ == '__main__':
|
22 |
+
# Load dataset with trust_remote_code=True to allow custom code execution
|
23 |
+
dataset = load_dataset("openwebtext", trust_remote_code=True, num_proc=num_proc_load_dataset)
|
24 |
# takes 54GB in huggingface .cache dir, about 8M documents (8,013,769)
|
25 |
dataset = load_dataset("openwebtext", num_proc=num_proc_load_dataset)
|
26 |
|