dataloader:
  batch_size: 16
  shuffle: true
  num_workers: 64
  drop_last: true
  pin_memory: true

model:
  target: cldm.cldm.ControlLDM
  params:
    # linear_start: 0.00085
    # linear_end: 0.0120
    num_timesteps_cond: 1
    log_every_t: 200
    timesteps: 1000
    first_stage_key: "jpg"
    cond_stage_key: "txt"
    control_key: "hint"
    image_size: 64
    channels: 100
    cond_stage_trainable: true
    # conditioning_key: crossattn
    monitor: val/loss_simple_ema
    scale_factor: 0.18215
    use_ema: False
    only_mid_control: False

    # control_stage_config:
    #   target: cldm.cldm.ControlNet
    #   params:
    #     image_size: 32 # unused
    #     in_channels: 100
    #     hint_channels: 768
    #     model_channels: 128
    #     attention_resolutions: [ 4, 2, 1 ]
    #     num_res_blocks: 2
    #     channel_mult: [ 1, 2, 4, 4 ]
    #     num_heads: 8
    #     use_spatial_transformer: True
    #     transformer_depth: 1
    #     context_dim: 768
    #     use_checkpoint: True
    #     legacy: False

    refer_config:
      target: cldm.cldm.ReferenceNet
      params:
        image_size: 32 # unused
        hint_in_channels: 1024
        hint_out_channels: 128
        in_channels: 100
        out_channels: 100
        model_channels: 1024
        attention_resolutions: [ 4, 2, 1 ]
        num_res_blocks: 1
        channel_mult: [ 1, 1 ]
        num_heads: 8
        use_spatial_transformer: True
        transformer_depth: 1
        context_dim: 512
        use_checkpoint: True
        dims: 1
        legacy: False

    unet_config:
      target: tortoise_model.DiffusionTts
      params:
        model_channels: 512
        num_layers: 8
        in_channels: 100
        in_latent_channels: 1024
        out_channels: 100
        dropout: 0
        use_fp16: False
        num_heads: 16
        layer_drop: 0.1
        unconditioned_percentage: 0.1
      # target: cldm.cldm.ControlledUnetModel
      # params:
      #   image_size: 32 # unused
      #   hint_in_channels: 1024
      #   hint_out_channels: 128
      #   in_channels: 100
      #   out_channels: 100
      #   model_channels: 1024
      #   attention_resolutions: [ 4, 2, 1 ]
      #   num_res_blocks: 1
      #   resblock_updown: True
      #   channel_mult: [ 1, 1 ]
      #   num_heads: 8
      #   use_spatial_transformer: True
      #   transformer_depth: 1
      #   context_dim: 512
      #   use_checkpoint: True
      #   dims: 1
      #   legacy: False

    cond_stage_config:
      target: cldm.cond_emb.CLIP
      params:
        embed_dim: 512
        vision_cfg:
          layers: 6
          width: 512
          head_width: 64
          mlp_ratio: 4.0
          patch_dropout: 0.4
          attentional_pool: False
          patch_size: 64
          image_size: 1000
          in_channels: 100
          pool_type: 'tok'
          pos_embed_type: 'learnable'
          final_ln_after_pool: false

train:
  train_batch_size: 32
  gradient_accumulate_every: 1
  train_lr: 0.0001
  train_num_steps: 1000000
  ema_update_every: 10
  ema_decay: 0.995
  adam_betas: [0.9, 0.99]
  save_and_sample_every: 1000
  timesteps: 1000
  sampling_timesteps: 1000
  results_folder: "results"
  logs_folder: "ttts/AA_diffusion/logs"
  num_workers: 32
  eps: 0.000000001
  keep_ckpts: 3
  all_in_mem: false

dataset:
  path: "/home/hyc/tortoise_plus_zh/ttts/datasets/databaker_data.jsonl"
  gpt_path: "/home/hyc/tortoise_plus_zh/ttts/gpt/logs/2023-12-24-14-22-14/model-70.pt"
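
# NOTE (assumption, not part of the original file): cldm/latent-diffusion style
# configs like this one are usually loaded with OmegaConf, and the model section
# is built from its `target` dotted path via instantiate_from_config. A minimal
# sketch of that convention, with a hypothetical config path:
#
#   from omegaconf import OmegaConf
#   from ldm.util import instantiate_from_config
#
#   cfg = OmegaConf.load("ttts/AA_diffusion/config.yaml")  # hypothetical path
#   model = instantiate_from_config(cfg.model)             # builds ControlLDM from `model.params`
#
# The actual entry point and loader live in the ttts/AA_diffusion training script.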