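# Diffusion training config for ttts/AA_diffusion (tortoise_plus_zh):
# a ControlLDM-style latent diffusion model over 100-channel acoustic
# frames (presumably mel features, given the TorToiSe-style DiffusionTts
# denoiser configured below), conditioned on GPT latents and a CLIP-style
# reference encoder.
#
# Loading sketch (assumption: as in the upstream latent-diffusion/ControlNet
# code that the cldm module derives from, the model section is consumed via
# OmegaConf plus an instantiate_from_config helper):
#
#   from omegaconf import OmegaConf
#   from ldm.util import instantiate_from_config
#
#   cfg = OmegaConf.load("path/to/this/config.yaml")
#   model = instantiate_from_config(cfg.model)  # -> cldm.cldm.ControlLDM
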
dataloader:
  batch_size: 16
  shuffle: true
  num_workers: 64
  drop_last: true
  pin_memory: true
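
# ControlLDM wrapper. first_stage_key/cond_stage_key/control_key name the
# batch entries holding the diffusion target ("jpg"), the conditioning
# ("txt") and the control hint ("hint"); the key names are inherited from
# the image-domain ControlNet code and here presumably point at mel frames,
# conditioning latents and the reference signal rather than actual images.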
model:
  target: cldm.cldm.ControlLDM
  params:
    # linear_start: 0.00085
    # linear_end: 0.0120
    num_timesteps_cond: 1
    log_every_t: 200
    timesteps: 1000
    first_stage_key: "jpg"
    cond_stage_key: "txt"
    control_key: "hint"
    image_size: 64
    channels: 100
    cond_stage_trainable: true
    # conditioning_key: crossattn
    monitor: val/loss_simple_ema
    scale_factor: 0.18215
    use_ema: false
    only_mid_control: false

    # control_stage_config:
    #   target: cldm.cldm.ControlNet
    #   params:
    #     image_size: 32 # unused
    #     in_channels: 100
    #     hint_channels: 768
    #     model_channels: 128
    #     attention_resolutions: [ 4, 2, 1 ]
    #     num_res_blocks: 2
    #     channel_mult: [ 1, 2, 4, 4 ]
    #     num_heads: 8
    #     use_spatial_transformer: True
    #     transformer_depth: 1
    #     context_dim: 768
    #     use_checkpoint: True
    #     legacy: False
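
    # ReferenceNet branch (replaces the ControlNet branch commented out
    # above): maps the 1024-channel hint down to 128 channels and injects
    # it into denoising. dims: 1 presumably selects 1-D convolutions, as in
    # the upstream UNet, matching sequence rather than image data.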
    refer_config:
      target: cldm.cldm.ReferenceNet
      params:
        image_size: 32 # unused
        hint_in_channels: 1024
        hint_out_channels: 128
        in_channels: 100
        out_channels: 100
        model_channels: 1024
        attention_resolutions: [ 4, 2, 1 ]
        num_res_blocks: 1
        channel_mult: [ 1, 1 ]
        num_heads: 8
        use_spatial_transformer: True
        transformer_depth: 1
        context_dim: 512
        use_checkpoint: True
        dims: 1
        legacy: False


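    # Denoiser: the TorToiSe-style DiffusionTts network (512 channels,
    # 8 layers) over 100-channel frames; in_latent_channels: 1024 matches
    # the hint width above and presumably carries latents from the GPT
    # checkpoint under dataset.gpt_path. The commented-out alternative
    # below is the image-style ControlledUnetModel.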
    unet_config:
      target: tortoise_model.DiffusionTts
      params:
        model_channels: 512
        num_layers: 8
        in_channels: 100
        in_latent_channels: 1024
        out_channels: 100
        dropout: 0
        use_fp16: False
        num_heads: 16
        layer_drop: 0.1
        unconditioned_percentage: 0.1
      # target: cldm.cldm.ControlledUnetModel
      # params:
      #   image_size: 32 # unused
      #   hint_in_channels: 1024
      #   hint_out_channels: 128
      #   in_channels: 100
      #   out_channels: 100
      #   model_channels: 1024
      #   attention_resolutions: [ 4, 2, 1 ]
      #   num_res_blocks: 1
      #   resblock_updown: True
      #   channel_mult: [ 1, 1]
      #   num_heads: 8
      #   use_spatial_transformer: True
      #   transformer_depth: 1
      #   context_dim: 512
      #   use_checkpoint: True
      #   dims: 1
      #   legacy: False

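    # Conditioning encoder: a small CLIP-style vision tower (6 layers,
    # width 512) that treats the 100-channel input as an "image" of up to
    # 1000 frames, patched at size 64. cond_stage_trainable: true above
    # means it is optimized jointly with the diffusion model.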
    cond_stage_config:
      target: cldm.cond_emb.CLIP
      params: 
        embed_dim: 512
        vision_cfg: 
          layers: 6
          width: 512
          head_width: 64
          mlp_ratio: 4.0
          patch_dropout: 0.4
          attentional_pool: False
          patch_size: 64
          image_size: 1000
          in_channels: 100
          pool_type: 'tok'
          pos_embed_type: 'learnable'
          final_ln_after_pool: false

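# Trainer settings. timesteps matches model.params.timesteps (1000);
# sampling_timesteps: 1000 presumably means full-length (non-strided)
# sampling when checkpoints are evaluated.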
train:
  train_batch_size: 32
  gradient_accumulate_every: 1
  train_lr: 0.0001
  train_num_steps: 1000000
  ema_update_every: 10
  ema_decay: 0.995
  adam_betas: [0.9, 0.99]
  save_and_sample_every: 1000
  timesteps: 1000
  sampling_timesteps: 1000
  results_folder: "results"
  logs_folder: "ttts/AA_diffusion/logs"
  num_workers: 32
  eps: 1.0e-9
  keep_ckpts: 3
  all_in_mem: false
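
# Data: a jsonl manifest plus the pre-trained GPT checkpoint whose latents
# presumably condition the diffusion model (cf. in_latent_channels: 1024
# above). Both paths are machine-specific.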
dataset:
  path: "/home/hyc/tortoise_plus_zh/ttts/datasets/databaker_data.jsonl"
  gpt_path: "/home/hyc/tortoise_plus_zh/ttts/gpt/logs/2023-12-24-14-22-14/model-70.pt"