# NOTE(review): the lines below were Hugging Face file-viewer scrape residue
# (page status, file size, commit hash, and gutter line numbers), not YAML
# content — commented out so the file parses.
# Spaces: Sleeping
# File size: 3,514 Bytes — commit 4ee33aa
---
# DataLoader settings for training (presumably passed to torch.utils.data.DataLoader — TODO confirm).
dataloader:
  batch_size: 16
  shuffle: true
  num_workers: 64  # NOTE(review): unusually high worker count — confirm host has the cores
  drop_last: true
  pin_memory: true
# Model definition: ControlLDM-style latent diffusion with a reference encoder
# (ReferenceNet), a Tortoise diffusion decoder, and a CLIP conditioning stage.
# Nesting follows the standard latent-diffusion config schema
# (model -> params -> sub-model *_config entries) — TODO confirm against cldm.cldm.ControlLDM.
model:
  target: cldm.cldm.ControlLDM
  params:
    # linear_start: 0.00085
    # linear_end: 0.0120
    num_timesteps_cond: 1
    log_every_t: 200
    timesteps: 1000
    first_stage_key: "jpg"
    cond_stage_key: "txt"
    control_key: "hint"
    image_size: 64
    channels: 100  # presumably mel/feature channels for audio latents — verify
    cond_stage_trainable: true
    # conditioning_key: crossattn
    monitor: val/loss_simple_ema
    scale_factor: 0.18215
    use_ema: false
    only_mid_control: false

    # control_stage_config:
    #   target: cldm.cldm.ControlNet
    #   params:
    #     image_size: 32  # unused
    #     in_channels: 100
    #     hint_channels: 768
    #     model_channels: 128
    #     attention_resolutions: [4, 2, 1]
    #     num_res_blocks: 2
    #     channel_mult: [1, 2, 4, 4]
    #     num_heads: 8
    #     use_spatial_transformer: true
    #     transformer_depth: 1
    #     context_dim: 768
    #     use_checkpoint: true
    #     legacy: false

    refer_config:
      target: cldm.cldm.ReferenceNet
      params:
        image_size: 32  # unused
        hint_in_channels: 1024
        hint_out_channels: 128
        in_channels: 100
        out_channels: 100
        model_channels: 1024
        attention_resolutions: [4, 2, 1]
        num_res_blocks: 1
        channel_mult: [1, 1]
        num_heads: 8
        use_spatial_transformer: true
        transformer_depth: 1
        context_dim: 512
        use_checkpoint: true
        dims: 1  # 1-D convolutions (audio/sequence data)
        legacy: false

    unet_config:
      target: tortoise_model.DiffusionTts
      params:
        model_channels: 512
        num_layers: 8
        in_channels: 100
        in_latent_channels: 1024
        out_channels: 100
        dropout: 0
        use_fp16: false
        num_heads: 16
        layer_drop: 0.1
        unconditioned_percentage: 0.1  # presumably classifier-free-guidance drop rate — verify

    # unet_config:
    #   target: cldm.cldm.ControlledUnetModel
    #   params:
    #     image_size: 32  # unused
    #     hint_in_channels: 1024
    #     hint_out_channels: 128
    #     in_channels: 100
    #     out_channels: 100
    #     model_channels: 1024
    #     attention_resolutions: [4, 2, 1]
    #     num_res_blocks: 1
    #     resblock_updown: true
    #     channel_mult: [1, 1]
    #     num_heads: 8
    #     use_spatial_transformer: true
    #     transformer_depth: 1
    #     context_dim: 512
    #     use_checkpoint: true
    #     dims: 1
    #     legacy: false

    cond_stage_config:
      target: cldm.cond_emb.CLIP
      params:
        embed_dim: 512
        vision_cfg:
          layers: 6
          width: 512
          head_width: 64
          mlp_ratio: 4.0
          patch_dropout: 0.4
          attentional_pool: false
          patch_size: 64
          image_size: 1000
          in_channels: 100
          pool_type: 'tok'
          pos_embed_type: 'learnable'
          final_ln_after_pool: false
# Trainer hyperparameters (optimizer, EMA, checkpointing, sampling cadence).
train:
  train_batch_size: 32
  gradient_accumulate_every: 1
  train_lr: 0.0001
  train_num_steps: 1000000
  ema_update_every: 10
  ema_decay: 0.995
  adam_betas: [0.9, 0.99]
  save_and_sample_every: 1000
  timesteps: 1000
  sampling_timesteps: 1000
  results_folder: "results"
  logs_folder: "ttts/AA_diffusion/logs"
  num_workers: 32
  eps: 0.000000001  # 1e-9 — presumably the Adam epsilon; verify against trainer code
  keep_ckpts: 3
  all_in_mem: false
# Dataset locations. NOTE(review): absolute home-directory paths are
# machine-specific — consider relative paths or env-var substitution.
dataset:
  path: "/home/hyc/tortoise_plus_zh/ttts/datasets/databaker_data.jsonl"
  gpt_path: "/home/hyc/tortoise_plus_zh/ttts/gpt/logs/2023-12-24-14-22-14/model-70.pt"