|
ngpus: 4 |
|
tokens: 50257 |
|
compile: true |
|
load_dir: null |
|
work_dir: null |
|
wandb: |
|
project: openwebtext |
|
run_name: pred_mask_nowe |
|
id: null |
|
is_resume: false |
|
training: |
|
batch_size: 256 |
|
accum: 2 |
|
n_iters: 1300001 |
|
snapshot_freq: 50000 |
|
log_freq: 50 |
|
eval_freq: 100 |
|
snapshot_freq_for_preemption: 10000 |
|
weight: standard |
|
snapshot_sampling: true |
|
ema: 0.9999 |
|
weighted_by_time: false |
|
data: |
|
train: openwebtext |
|
valid: wikitext103 |
|
cache_dir: /pscratch/sd/s/sulinl/data |
|
num_proc: 64 |
|
graph: |
|
type: uniform |
|
file: data |
|
report_all: false |
|
noise: |
|
type: loglinear |
|
sigma_min: 0.0001 |
|
sigma_max: 20 |
|
sampling: |
|
predictor: euler |
|
steps: 128 |
|
noise_removal: true |
|
eval: |
|
batch_size: 256 |
|
perplexity: true |
|
perplexity_batch_size: 32 |
|
optim: |
|
weight_decay: 0 |
|
optimizer: AdamW |
|
lr: 0.0003 |
|
beta1: 0.9 |
|
beta2: 0.999 |
|
eps: 1.0e-08 |
|
warmup: 2500 |
|
grad_clip: 1.0 |
|
model: |
|
name: small |
|
type: ddit |
|
hidden_size: 768 |
|
cond_dim: 128 |
|
length: 1024 |
|
n_blocks: 12 |
|
n_heads: 12 |
|
scale_by_sigma: true |
|
dropout: 0.1 |
|
|