Upload folder using huggingface_hub

Browse files

Files changed (12) hide show

checkpoint-3500/audio_proj/config.json +5 -0
checkpoint-3500/audio_proj/diffusion_pytorch_model.safetensors +3 -0
checkpoint-3500/diffusion_net/config.json +88 -0
checkpoint-3500/diffusion_net/diffusion_pytorch_model.safetensors +3 -0
checkpoint-3500/image_proj/config.json +5 -0
checkpoint-3500/image_proj/diffusion_pytorch_model.safetensors +3 -0
checkpoint-3500/optimizer.bin +3 -0
checkpoint-3500/random_states_0.pkl +3 -0
checkpoint-3500/reference_net/config.json +66 -0
checkpoint-3500/reference_net/diffusion_pytorch_model.safetensors +3 -0
checkpoint-3500/scheduler.bin +3 -0
config.yaml +55 -0

checkpoint-3500/audio_proj/config.json ADDED Viewed

	@@ -0,0 +1,5 @@

+{
+  "_class_name": "AudioProjModel",
+  "_diffusers_version": "0.31.0",
+  "_name_or_path": "memoavatar/memo"
+}

checkpoint-3500/audio_proj/diffusion_pytorch_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:bc1c7d4583571f4648f5bf82b70246671d4576837154be38436327f3e8c0e8cc
+size 72930976

checkpoint-3500/diffusion_net/config.json ADDED Viewed

	@@ -0,0 +1,88 @@

+{
+  "_center_input_sample": false,
+  "_class_name": "UNet3DConditionModel",
+  "_diffusers_version": "0.31.0",
+  "_name_or_path": "memoavatar/memo",
+  "_out_channels": 4,
+  "act_fn": "silu",
+  "addition_embed_type": null,
+  "addition_embed_type_num_heads": 64,
+  "addition_time_embed_dim": null,
+  "attention_head_dim": 8,
+  "attention_type": "default",
+  "audio_attention_dim": 768,
+  "block_out_channels": [
+    320,
+    640,
+    1280,
+    1280
+  ],
+  "center_input_sample": false,
+  "class_embed_type": null,
+  "class_embeddings_concat": false,
+  "conv_in_kernel": 3,
+  "cross_attention_dim": 768,
+  "down_block_types": [
+    "CrossAttnDownBlock3D",
+    "CrossAttnDownBlock3D",
+    "CrossAttnDownBlock3D",
+    "DownBlock3D"
+  ],
+  "downsample_padding": 1,
+  "dropout": 0.0,
+  "dual_cross_attention": false,
+  "emo_drop_rate": 0.05,
+  "encoder_hid_dim": null,
+  "encoder_hid_dim_type": null,
+  "flip_sin_to_cos": true,
+  "freq_shift": 0,
+  "in_channels": 4,
+  "layers_per_block": 2,
+  "mid_block_only_cross_attention": null,
+  "mid_block_scale_factor": 1,
+  "mid_block_type": "UNetMidBlock3DCrossAttn",
+  "motion_module_kwargs": {
+    "attention_block_types": [
+      "Temporal_Self",
+      "Temporal_Self"
+    ],
+    "num_attention_heads": 8,
+    "num_transformer_block": 1,
+    "temporal_attention_dim_div": 1,
+    "temporal_position_encoding": true,
+    "temporal_position_encoding_max_len": 32
+  },
+  "motion_module_resolutions": [
+    1,
+    2,
+    4,
+    8
+  ],
+  "norm_eps": 1e-05,
+  "norm_num_groups": 32,
+  "num_attention_heads": null,
+  "num_class_embeds": null,
+  "only_cross_attention": false,
+  "out_channels": 4,
+  "projection_class_embeddings_input_dim": null,
+  "resnet_time_scale_shift": "default",
+  "reverse_transformer_layers_per_block": null,
+  "sample_size": 64,
+  "time_cond_proj_dim": null,
+  "time_embedding_act_fn": null,
+  "time_embedding_dim": null,
+  "time_embedding_type": "positional",
+  "timestep_post_act": null,
+  "transformer_layers_per_block": 1,
+  "unet_use_cross_frame_attention": false,
+  "unet_use_temporal_attention": false,
+  "up_block_types": [
+    "UpBlock3D",
+    "CrossAttnUpBlock3D",
+    "CrossAttnUpBlock3D",
+    "CrossAttnUpBlock3D"
+  ],
+  "upcast_attention": false,
+  "use_inflated_groupnorm": true,
+  "use_linear_projection": false
+}

checkpoint-3500/diffusion_net/diffusion_pytorch_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:f7480c7dbc5a9335c0f5c1bb7d484707eb1decf56dd3af792616772b34c873e8
+size 3356369240

checkpoint-3500/image_proj/config.json ADDED Viewed

	@@ -0,0 +1,5 @@

+{
+  "_class_name": "ImageProjModel",
+  "_diffusers_version": "0.31.0",
+  "_name_or_path": "memoavatar/memo"
+}

checkpoint-3500/image_proj/diffusion_pytorch_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9855dabf683d8c17b265567737bc90e5a0b032db7551b78f028cd9b03439abe5
+size 3155280

checkpoint-3500/optimizer.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:7ab99fc1d13fefadc92ba2b7ef9202477481d9cf7a2e2b11c469e9f63793aea2
+size 3270385552

checkpoint-3500/random_states_0.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:68e5025ebacc81710092a492731510b40a86ecae6af1087b30215e8b7b289969
+size 14408

checkpoint-3500/reference_net/config.json ADDED Viewed

	@@ -0,0 +1,66 @@

+{
+  "_center_input_sample": false,
+  "_class_name": "UNet2DConditionModel",
+  "_diffusers_version": "0.31.0",
+  "_name_or_path": "memoavatar/memo",
+  "_out_channels": 4,
+  "act_fn": "silu",
+  "addition_embed_type": null,
+  "addition_embed_type_num_heads": 64,
+  "addition_time_embed_dim": null,
+  "attention_head_dim": 8,
+  "attention_type": "default",
+  "block_out_channels": [
+    320,
+    640,
+    1280,
+    1280
+  ],
+  "center_input_sample": false,
+  "class_embed_type": null,
+  "class_embeddings_concat": false,
+  "conv_in_kernel": 3,
+  "cross_attention_dim": 768,
+  "down_block_types": [
+    "CrossAttnDownBlock2D",
+    "CrossAttnDownBlock2D",
+    "CrossAttnDownBlock2D",
+    "DownBlock2D"
+  ],
+  "downsample_padding": 1,
+  "dropout": 0.0,
+  "dual_cross_attention": false,
+  "encoder_hid_dim": null,
+  "encoder_hid_dim_type": null,
+  "flip_sin_to_cos": true,
+  "freq_shift": 0,
+  "in_channels": 4,
+  "layers_per_block": 2,
+  "mid_block_only_cross_attention": null,
+  "mid_block_scale_factor": 1,
+  "mid_block_type": "UNetMidBlock2DCrossAttn",
+  "norm_eps": 1e-05,
+  "norm_num_groups": 32,
+  "num_attention_heads": null,
+  "num_class_embeds": null,
+  "only_cross_attention": false,
+  "out_channels": 4,
+  "projection_class_embeddings_input_dim": null,
+  "resnet_time_scale_shift": "default",
+  "reverse_transformer_layers_per_block": null,
+  "sample_size": 64,
+  "time_cond_proj_dim": null,
+  "time_embedding_act_fn": null,
+  "time_embedding_dim": null,
+  "time_embedding_type": "positional",
+  "timestep_post_act": null,
+  "transformer_layers_per_block": 1,
+  "up_block_types": [
+    "UpBlock2D",
+    "CrossAttnUpBlock2D",
+    "CrossAttnUpBlock2D",
+    "CrossAttnUpBlock2D"
+  ],
+  "upcast_attention": false,
+  "use_linear_projection": false
+}

checkpoint-3500/reference_net/diffusion_pytorch_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:0550a76a5df4c295610040f90075b3bbf64f7d6be58cfc93a9722be1377b2486
+size 1714214152

checkpoint-3500/scheduler.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:6802221f6347629ca3a6e377a3bbcf1d2a6c0b7ef7e83794854e08587237ffd4
+size 1000

config.yaml ADDED Viewed

	@@ -0,0 +1,55 @@

+tracker_project_name: memo
+output_dir: outputs/finetune
+resume_from_checkpoint: null
+model_name_or_path: memoavatar/memo
+vae: stabilityai/sd-vae-ft-mse
+gradient_checkpointing: true
+gradient_accumulation_steps: 1
+train_batch_size: 1
+max_train_steps: 3500
+num_train_epochs: -1
+enable_xformers_memory_efficient_attention: true
+checkpoints_total_limit: 20
+robust_training: true
+learning_rate: 1.0e-05
+max_grad_norm: 1.0
+scale_lr: false
+lr_scheduler: constant
+lr_warmup_steps: 0
+seed: 42
+mixed_precision: bf16
+use_8bit_adam: false
+allow_tf32: true
+use_ema: false
+adam_beta1: 0.9
+adam_beta2: 0.999
+adam_weight_decay: 0.01
+adam_epsilon: 1.0e-08
+dataloader_num_workers: 16
+prefetch_factor: 4
+checkpointing_steps: 5000
+data:
+  width: 512
+  height: 512
+  num_past_frames: 16
+  dynamic_past_frames: false
+  n_sample_frames: 16
+  audio_margin: 2
+  metadata_paths:
+  - data/embedding/metadata.jsonl
+weighting_scheme: logit_normal
+logit_mean: 0.0
+logit_std: 1.0
+mode_scale: 1.29
+noise_scheduler_kwargs:
+  num_train_timesteps: 1000
+train_reference_net: false
+train_diffusion_net: true
+train_image_proj: false
+train_audio_proj: false
+trainable_modules:
+- motion_modules
+- audio_modules
+uncond_img_ratio: 0.05
+uncond_audio_ratio: 0.05
+start_ratio: 0.05