rs545837 committed 19 days ago

Commit

bb2ffb3

verified ·

1 Parent(s): 1a1ad5d

Upload folder using huggingface_hub

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

.ipynb_checkpoints/config-checkpoint.yaml +56 -0
checkpoint-1500/audio_proj/config.json +5 -0
checkpoint-1500/audio_proj/diffusion_pytorch_model.safetensors +3 -0
checkpoint-1500/diffusion_net/config.json +88 -0
checkpoint-3500/audio_proj/config.json +5 -0
checkpoint-3500/audio_proj/diffusion_pytorch_model.safetensors +3 -0
checkpoint-3500/diffusion_net/config.json +88 -0
checkpoint-3500/diffusion_net/diffusion_pytorch_model.safetensors +3 -0
checkpoint-3500/image_proj/config.json +5 -0
checkpoint-3500/image_proj/diffusion_pytorch_model.safetensors +3 -0
checkpoint-3500/optimizer.bin +3 -0
checkpoint-3500/random_states_0.pkl +3 -0
checkpoint-3500/reference_net/config.json +66 -0
checkpoint-3500/reference_net/diffusion_pytorch_model.safetensors +3 -0
checkpoint-3500/scheduler.bin +3 -0
checkpoint-5000/audio_proj/config.json +5 -0
checkpoint-5000/audio_proj/diffusion_pytorch_model.safetensors +3 -0
checkpoint-5000/diffusion_net/config.json +88 -0
checkpoint-5000/diffusion_net/diffusion_pytorch_model.safetensors +3 -0
checkpoint-5000/image_proj/config.json +5 -0
checkpoint-5000/image_proj/diffusion_pytorch_model.safetensors +3 -0
checkpoint-5000/optimizer.bin +3 -0
checkpoint-5000/random_states_0.pkl +3 -0
checkpoint-5000/reference_net/config.json +66 -0
checkpoint-5000/reference_net/diffusion_pytorch_model.safetensors +3 -0
checkpoint-5000/scheduler.bin +3 -0
checkpoint-7000/audio_proj/config.json +5 -0
checkpoint-7000/audio_proj/diffusion_pytorch_model.safetensors +3 -0
checkpoint-7000/diffusion_net/config.json +88 -0
checkpoint-7000/diffusion_net/diffusion_pytorch_model.safetensors +3 -0
checkpoint-7000/image_proj/config.json +5 -0
checkpoint-7000/image_proj/diffusion_pytorch_model.safetensors +3 -0
checkpoint-7000/optimizer.bin +3 -0
checkpoint-7000/random_states_0.pkl +3 -0
checkpoint-7000/reference_net/config.json +66 -0
checkpoint-7000/reference_net/diffusion_pytorch_model.safetensors +3 -0
checkpoint-7000/scheduler.bin +3 -0
config.yaml +56 -0
logs/memo/1741022678.2856228/events.out.tfevents.1741022678.3c52c6a06e3b +3 -0
logs/memo/1741022678.2878249/hparams.yml +55 -0
logs/memo/1741046967.6854546/events.out.tfevents.1741046967.369f90aa7051 +3 -0
logs/memo/1741046967.688156/.ipynb_checkpoints/hparams-checkpoint.yml +55 -0
logs/memo/1741046967.688156/hparams.yml +55 -0
logs/memo/1741060237.980531/events.out.tfevents.1741060237.369f90aa7051 +3 -0
logs/memo/1741060237.983221/hparams.yml +55 -0
logs/memo/1741060310.7917793/events.out.tfevents.1741060310.369f90aa7051 +3 -0
logs/memo/1741060310.7946274/hparams.yml +55 -0
logs/memo/1741060338.6906004/events.out.tfevents.1741060338.369f90aa7051 +3 -0
logs/memo/1741060338.6932552/hparams.yml +55 -0
logs/memo/1741145698.900579/events.out.tfevents.1741145698.16e2a27e51cb +3 -0

.ipynb_checkpoints/config-checkpoint.yaml ADDED Viewed

	@@ -0,0 +1,56 @@

+tracker_project_name: memo
+output_dir: outputs
+resume_from_checkpoint: null
+model_name_or_path: memoavatar/memo
+vae: stabilityai/sd-vae-ft-mse
+gradient_checkpointing: true
+gradient_accumulation_steps: 1
+train_batch_size: 2
+max_train_steps: 3500
+num_train_epochs: -1
+enable_xformers_memory_efficient_attention: true
+checkpoints_total_limit: 3
+robust_training: true
+learning_rate: 1e-5
+max_grad_norm: 1.0
+scale_lr: false
+lr_scheduler: constant
+lr_warmup_steps: 0
+seed: 42
+mixed_precision: bf16
+use_8bit_adam: false
+allow_tf32: true
+use_ema: false
+adam_beta1: 0.9
+adam_beta2: 0.999
+adam_weight_decay: 0.01
+adam_epsilon: 1e-08
+dataloader_num_workers: 16
+prefetch_factor: 4
+checkpointing_steps: 5000
+data:
+  width: 512
+  height: 512
+  num_past_frames: 16
+  dynamic_past_frames: false
+  n_sample_frames: 16
+  audio_margin: 2
+  metadata_paths:
+    - assets/embedding/metadata.jsonl
+weighting_scheme: logit_normal
+logit_mean: 0.0
+logit_std: 1.0
+mode_scale: 1.29
+noise_scheduler_kwargs:
+  num_train_timesteps: 1000
+train_reference_net: true
+train_diffusion_net: true
+train_image_proj: true
+train_audio_proj: true
+trainable_modules:
+  - to_q
+  - to_k
+  - to_v
+uncond_img_ratio: 0.05
+uncond_audio_ratio: 0.05
+start_ratio: 0.05

checkpoint-1500/audio_proj/config.json ADDED Viewed

	@@ -0,0 +1,5 @@

+{
+  "_class_name": "AudioProjModel",
+  "_diffusers_version": "0.31.0",
+  "_name_or_path": "memoavatar/memo"
+}

checkpoint-1500/audio_proj/diffusion_pytorch_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:bc1c7d4583571f4648f5bf82b70246671d4576837154be38436327f3e8c0e8cc
+size 72930976

checkpoint-1500/diffusion_net/config.json ADDED Viewed

	@@ -0,0 +1,88 @@

+{
+  "_center_input_sample": false,
+  "_class_name": "UNet3DConditionModel",
+  "_diffusers_version": "0.31.0",
+  "_name_or_path": "memoavatar/memo",
+  "_out_channels": 4,
+  "act_fn": "silu",
+  "addition_embed_type": null,
+  "addition_embed_type_num_heads": 64,
+  "addition_time_embed_dim": null,
+  "attention_head_dim": 8,
+  "attention_type": "default",
+  "audio_attention_dim": 768,
+  "block_out_channels": [
+    320,
+    640,
+    1280,
+    1280
+  ],
+  "center_input_sample": false,
+  "class_embed_type": null,
+  "class_embeddings_concat": false,
+  "conv_in_kernel": 3,
+  "cross_attention_dim": 768,
+  "down_block_types": [
+    "CrossAttnDownBlock3D",
+    "CrossAttnDownBlock3D",
+    "CrossAttnDownBlock3D",
+    "DownBlock3D"
+  ],
+  "downsample_padding": 1,
+  "dropout": 0.0,
+  "dual_cross_attention": false,
+  "emo_drop_rate": 0.05,
+  "encoder_hid_dim": null,
+  "encoder_hid_dim_type": null,
+  "flip_sin_to_cos": true,
+  "freq_shift": 0,
+  "in_channels": 4,
+  "layers_per_block": 2,
+  "mid_block_only_cross_attention": null,
+  "mid_block_scale_factor": 1,
+  "mid_block_type": "UNetMidBlock3DCrossAttn",
+  "motion_module_kwargs": {
+    "attention_block_types": [
+      "Temporal_Self",
+      "Temporal_Self"
+    ],
+    "num_attention_heads": 8,
+    "num_transformer_block": 1,
+    "temporal_attention_dim_div": 1,
+    "temporal_position_encoding": true,
+    "temporal_position_encoding_max_len": 32
+  },
+  "motion_module_resolutions": [
+    1,
+    2,
+    4,
+    8
+  ],
+  "norm_eps": 1e-05,
+  "norm_num_groups": 32,
+  "num_attention_heads": null,
+  "num_class_embeds": null,
+  "only_cross_attention": false,
+  "out_channels": 4,
+  "projection_class_embeddings_input_dim": null,
+  "resnet_time_scale_shift": "default",
+  "reverse_transformer_layers_per_block": null,
+  "sample_size": 64,
+  "time_cond_proj_dim": null,
+  "time_embedding_act_fn": null,
+  "time_embedding_dim": null,
+  "time_embedding_type": "positional",
+  "timestep_post_act": null,
+  "transformer_layers_per_block": 1,
+  "unet_use_cross_frame_attention": false,
+  "unet_use_temporal_attention": false,
+  "up_block_types": [
+    "UpBlock3D",
+    "CrossAttnUpBlock3D",
+    "CrossAttnUpBlock3D",
+    "CrossAttnUpBlock3D"
+  ],
+  "upcast_attention": false,
+  "use_inflated_groupnorm": true,
+  "use_linear_projection": false
+}

checkpoint-3500/audio_proj/config.json ADDED Viewed

	@@ -0,0 +1,5 @@

+{
+  "_class_name": "AudioProjModel",
+  "_diffusers_version": "0.31.0",
+  "_name_or_path": "memoavatar/memo"
+}

checkpoint-3500/audio_proj/diffusion_pytorch_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:7d58084a9b3eaa79f6748560ca9cce7595931cc6310517f94fec1ae78712cca4
+size 72930976

checkpoint-3500/diffusion_net/config.json ADDED Viewed

	@@ -0,0 +1,88 @@

+{
+  "_center_input_sample": false,
+  "_class_name": "UNet3DConditionModel",
+  "_diffusers_version": "0.31.0",
+  "_name_or_path": "memoavatar/memo",
+  "_out_channels": 4,
+  "act_fn": "silu",
+  "addition_embed_type": null,
+  "addition_embed_type_num_heads": 64,
+  "addition_time_embed_dim": null,
+  "attention_head_dim": 8,
+  "attention_type": "default",
+  "audio_attention_dim": 768,
+  "block_out_channels": [
+    320,
+    640,
+    1280,
+    1280
+  ],
+  "center_input_sample": false,
+  "class_embed_type": null,
+  "class_embeddings_concat": false,
+  "conv_in_kernel": 3,
+  "cross_attention_dim": 768,
+  "down_block_types": [
+    "CrossAttnDownBlock3D",
+    "CrossAttnDownBlock3D",
+    "CrossAttnDownBlock3D",
+    "DownBlock3D"
+  ],
+  "downsample_padding": 1,
+  "dropout": 0.0,
+  "dual_cross_attention": false,
+  "emo_drop_rate": 0.05,
+  "encoder_hid_dim": null,
+  "encoder_hid_dim_type": null,
+  "flip_sin_to_cos": true,
+  "freq_shift": 0,
+  "in_channels": 4,
+  "layers_per_block": 2,
+  "mid_block_only_cross_attention": null,
+  "mid_block_scale_factor": 1,
+  "mid_block_type": "UNetMidBlock3DCrossAttn",
+  "motion_module_kwargs": {
+    "attention_block_types": [
+      "Temporal_Self",
+      "Temporal_Self"
+    ],
+    "num_attention_heads": 8,
+    "num_transformer_block": 1,
+    "temporal_attention_dim_div": 1,
+    "temporal_position_encoding": true,
+    "temporal_position_encoding_max_len": 32
+  },
+  "motion_module_resolutions": [
+    1,
+    2,
+    4,
+    8
+  ],
+  "norm_eps": 1e-05,
+  "norm_num_groups": 32,
+  "num_attention_heads": null,
+  "num_class_embeds": null,
+  "only_cross_attention": false,
+  "out_channels": 4,
+  "projection_class_embeddings_input_dim": null,
+  "resnet_time_scale_shift": "default",
+  "reverse_transformer_layers_per_block": null,
+  "sample_size": 64,
+  "time_cond_proj_dim": null,
+  "time_embedding_act_fn": null,
+  "time_embedding_dim": null,
+  "time_embedding_type": "positional",
+  "timestep_post_act": null,
+  "transformer_layers_per_block": 1,
+  "unet_use_cross_frame_attention": false,
+  "unet_use_temporal_attention": false,
+  "up_block_types": [
+    "UpBlock3D",
+    "CrossAttnUpBlock3D",
+    "CrossAttnUpBlock3D",
+    "CrossAttnUpBlock3D"
+  ],
+  "upcast_attention": false,
+  "use_inflated_groupnorm": true,
+  "use_linear_projection": false
+}

checkpoint-3500/diffusion_net/diffusion_pytorch_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:1c49282fecedb44b61587af090b411d19c91cf02d1f931f3d09a200dafdf04e4
+size 3356369240

checkpoint-3500/image_proj/config.json ADDED Viewed

	@@ -0,0 +1,5 @@

+{
+  "_class_name": "ImageProjModel",
+  "_diffusers_version": "0.31.0",
+  "_name_or_path": "memoavatar/memo"
+}

checkpoint-3500/image_proj/diffusion_pytorch_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:da65bdfcdf6b4b7fe8fde646092c71fe592741cbcd2fcdb1bbf16ea35982790f
+size 3155280

checkpoint-3500/optimizer.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:b7a90617826926ee0ede2186b38412e05908ef7f83f446e3faa584d32dd37816
+size 1422326990

checkpoint-3500/random_states_0.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:d8143d967fa3d4e7d3eaa5ed4fb8f80dd746452184ededbfb33cfb2926728d23
+size 14408

checkpoint-3500/reference_net/config.json ADDED Viewed

	@@ -0,0 +1,66 @@

+{
+  "_center_input_sample": false,
+  "_class_name": "UNet2DConditionModel",
+  "_diffusers_version": "0.31.0",
+  "_name_or_path": "memoavatar/memo",
+  "_out_channels": 4,
+  "act_fn": "silu",
+  "addition_embed_type": null,
+  "addition_embed_type_num_heads": 64,
+  "addition_time_embed_dim": null,
+  "attention_head_dim": 8,
+  "attention_type": "default",
+  "block_out_channels": [
+    320,
+    640,
+    1280,
+    1280
+  ],
+  "center_input_sample": false,
+  "class_embed_type": null,
+  "class_embeddings_concat": false,
+  "conv_in_kernel": 3,
+  "cross_attention_dim": 768,
+  "down_block_types": [
+    "CrossAttnDownBlock2D",
+    "CrossAttnDownBlock2D",
+    "CrossAttnDownBlock2D",
+    "DownBlock2D"
+  ],
+  "downsample_padding": 1,
+  "dropout": 0.0,
+  "dual_cross_attention": false,
+  "encoder_hid_dim": null,
+  "encoder_hid_dim_type": null,
+  "flip_sin_to_cos": true,
+  "freq_shift": 0,
+  "in_channels": 4,
+  "layers_per_block": 2,
+  "mid_block_only_cross_attention": null,
+  "mid_block_scale_factor": 1,
+  "mid_block_type": "UNetMidBlock2DCrossAttn",
+  "norm_eps": 1e-05,
+  "norm_num_groups": 32,
+  "num_attention_heads": null,
+  "num_class_embeds": null,
+  "only_cross_attention": false,
+  "out_channels": 4,
+  "projection_class_embeddings_input_dim": null,
+  "resnet_time_scale_shift": "default",
+  "reverse_transformer_layers_per_block": null,
+  "sample_size": 64,
+  "time_cond_proj_dim": null,
+  "time_embedding_act_fn": null,
+  "time_embedding_dim": null,
+  "time_embedding_type": "positional",
+  "timestep_post_act": null,
+  "transformer_layers_per_block": 1,
+  "up_block_types": [
+    "UpBlock2D",
+    "CrossAttnUpBlock2D",
+    "CrossAttnUpBlock2D",
+    "CrossAttnUpBlock2D"
+  ],
+  "upcast_attention": false,
+  "use_linear_projection": false
+}

checkpoint-3500/reference_net/diffusion_pytorch_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:345b2f774b724493a1f4873560ffd419b81d4324be80028d67625f27ebcab6a1
+size 1714214152

checkpoint-3500/scheduler.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:6802221f6347629ca3a6e377a3bbcf1d2a6c0b7ef7e83794854e08587237ffd4
+size 1000

checkpoint-5000/audio_proj/config.json ADDED Viewed

	@@ -0,0 +1,5 @@

+{
+  "_class_name": "AudioProjModel",
+  "_diffusers_version": "0.31.0",
+  "_name_or_path": "memoavatar/memo"
+}

checkpoint-5000/audio_proj/diffusion_pytorch_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:bc1c7d4583571f4648f5bf82b70246671d4576837154be38436327f3e8c0e8cc
+size 72930976

checkpoint-5000/diffusion_net/config.json ADDED Viewed

	@@ -0,0 +1,88 @@

+{
+  "_center_input_sample": false,
+  "_class_name": "UNet3DConditionModel",
+  "_diffusers_version": "0.31.0",
+  "_name_or_path": "memoavatar/memo",
+  "_out_channels": 4,
+  "act_fn": "silu",
+  "addition_embed_type": null,
+  "addition_embed_type_num_heads": 64,
+  "addition_time_embed_dim": null,
+  "attention_head_dim": 8,
+  "attention_type": "default",
+  "audio_attention_dim": 768,
+  "block_out_channels": [
+    320,
+    640,
+    1280,
+    1280
+  ],
+  "center_input_sample": false,
+  "class_embed_type": null,
+  "class_embeddings_concat": false,
+  "conv_in_kernel": 3,
+  "cross_attention_dim": 768,
+  "down_block_types": [
+    "CrossAttnDownBlock3D",
+    "CrossAttnDownBlock3D",
+    "CrossAttnDownBlock3D",
+    "DownBlock3D"
+  ],
+  "downsample_padding": 1,
+  "dropout": 0.0,
+  "dual_cross_attention": false,
+  "emo_drop_rate": 0.05,
+  "encoder_hid_dim": null,
+  "encoder_hid_dim_type": null,
+  "flip_sin_to_cos": true,
+  "freq_shift": 0,
+  "in_channels": 4,
+  "layers_per_block": 2,
+  "mid_block_only_cross_attention": null,
+  "mid_block_scale_factor": 1,
+  "mid_block_type": "UNetMidBlock3DCrossAttn",
+  "motion_module_kwargs": {
+    "attention_block_types": [
+      "Temporal_Self",
+      "Temporal_Self"
+    ],
+    "num_attention_heads": 8,
+    "num_transformer_block": 1,
+    "temporal_attention_dim_div": 1,
+    "temporal_position_encoding": true,
+    "temporal_position_encoding_max_len": 32
+  },
+  "motion_module_resolutions": [
+    1,
+    2,
+    4,
+    8
+  ],
+  "norm_eps": 1e-05,
+  "norm_num_groups": 32,
+  "num_attention_heads": null,
+  "num_class_embeds": null,
+  "only_cross_attention": false,
+  "out_channels": 4,
+  "projection_class_embeddings_input_dim": null,
+  "resnet_time_scale_shift": "default",
+  "reverse_transformer_layers_per_block": null,
+  "sample_size": 64,
+  "time_cond_proj_dim": null,
+  "time_embedding_act_fn": null,
+  "time_embedding_dim": null,
+  "time_embedding_type": "positional",
+  "timestep_post_act": null,
+  "transformer_layers_per_block": 1,
+  "unet_use_cross_frame_attention": false,
+  "unet_use_temporal_attention": false,
+  "up_block_types": [
+    "UpBlock3D",
+    "CrossAttnUpBlock3D",
+    "CrossAttnUpBlock3D",
+    "CrossAttnUpBlock3D"
+  ],
+  "upcast_attention": false,
+  "use_inflated_groupnorm": true,
+  "use_linear_projection": false
+}

checkpoint-5000/diffusion_net/diffusion_pytorch_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:53f04a95f121794e54554f6b8746a239354167a03c70c2d3e6829cabd517754f
+size 3356369240

checkpoint-5000/image_proj/config.json ADDED Viewed

	@@ -0,0 +1,5 @@

+{
+  "_class_name": "ImageProjModel",
+  "_diffusers_version": "0.31.0",
+  "_name_or_path": "memoavatar/memo"
+}

checkpoint-5000/image_proj/diffusion_pytorch_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9855dabf683d8c17b265567737bc90e5a0b032db7551b78f028cd9b03439abe5
+size 3155280

checkpoint-5000/optimizer.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:5932ca6ad987609976876f0e8473b54ee286a9e62778f4152c1de736a8b98c30
+size 3270385552

checkpoint-5000/random_states_0.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e0dc7aef135da1cc848c02bc449b185f15582060d87aa35e527e7d944cd090bd
+size 14344

checkpoint-5000/reference_net/config.json ADDED Viewed

	@@ -0,0 +1,66 @@

+{
+  "_center_input_sample": false,
+  "_class_name": "UNet2DConditionModel",
+  "_diffusers_version": "0.31.0",
+  "_name_or_path": "memoavatar/memo",
+  "_out_channels": 4,
+  "act_fn": "silu",
+  "addition_embed_type": null,
+  "addition_embed_type_num_heads": 64,
+  "addition_time_embed_dim": null,
+  "attention_head_dim": 8,
+  "attention_type": "default",
+  "block_out_channels": [
+    320,
+    640,
+    1280,
+    1280
+  ],
+  "center_input_sample": false,
+  "class_embed_type": null,
+  "class_embeddings_concat": false,
+  "conv_in_kernel": 3,
+  "cross_attention_dim": 768,
+  "down_block_types": [
+    "CrossAttnDownBlock2D",
+    "CrossAttnDownBlock2D",
+    "CrossAttnDownBlock2D",
+    "DownBlock2D"
+  ],
+  "downsample_padding": 1,
+  "dropout": 0.0,
+  "dual_cross_attention": false,
+  "encoder_hid_dim": null,
+  "encoder_hid_dim_type": null,
+  "flip_sin_to_cos": true,
+  "freq_shift": 0,
+  "in_channels": 4,
+  "layers_per_block": 2,
+  "mid_block_only_cross_attention": null,
+  "mid_block_scale_factor": 1,
+  "mid_block_type": "UNetMidBlock2DCrossAttn",
+  "norm_eps": 1e-05,
+  "norm_num_groups": 32,
+  "num_attention_heads": null,
+  "num_class_embeds": null,
+  "only_cross_attention": false,
+  "out_channels": 4,
+  "projection_class_embeddings_input_dim": null,
+  "resnet_time_scale_shift": "default",
+  "reverse_transformer_layers_per_block": null,
+  "sample_size": 64,
+  "time_cond_proj_dim": null,
+  "time_embedding_act_fn": null,
+  "time_embedding_dim": null,
+  "time_embedding_type": "positional",
+  "timestep_post_act": null,
+  "transformer_layers_per_block": 1,
+  "up_block_types": [
+    "UpBlock2D",
+    "CrossAttnUpBlock2D",
+    "CrossAttnUpBlock2D",
+    "CrossAttnUpBlock2D"
+  ],
+  "upcast_attention": false,
+  "use_linear_projection": false
+}

checkpoint-5000/reference_net/diffusion_pytorch_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:0550a76a5df4c295610040f90075b3bbf64f7d6be58cfc93a9722be1377b2486
+size 1714214152

checkpoint-5000/scheduler.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:920333155b08e76154b89a85170c320833074db7b84649a9cfe79fd10b6bf2d9
+size 1000

checkpoint-7000/audio_proj/config.json ADDED Viewed

	@@ -0,0 +1,5 @@

+{
+  "_class_name": "AudioProjModel",
+  "_diffusers_version": "0.31.0",
+  "_name_or_path": "memoavatar/memo"
+}

checkpoint-7000/audio_proj/diffusion_pytorch_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:bc1c7d4583571f4648f5bf82b70246671d4576837154be38436327f3e8c0e8cc
+size 72930976

checkpoint-7000/diffusion_net/config.json ADDED Viewed

	@@ -0,0 +1,88 @@

+{
+  "_center_input_sample": false,
+  "_class_name": "UNet3DConditionModel",
+  "_diffusers_version": "0.31.0",
+  "_name_or_path": "memoavatar/memo",
+  "_out_channels": 4,
+  "act_fn": "silu",
+  "addition_embed_type": null,
+  "addition_embed_type_num_heads": 64,
+  "addition_time_embed_dim": null,
+  "attention_head_dim": 8,
+  "attention_type": "default",
+  "audio_attention_dim": 768,
+  "block_out_channels": [
+    320,
+    640,
+    1280,
+    1280
+  ],
+  "center_input_sample": false,
+  "class_embed_type": null,
+  "class_embeddings_concat": false,
+  "conv_in_kernel": 3,
+  "cross_attention_dim": 768,
+  "down_block_types": [
+    "CrossAttnDownBlock3D",
+    "CrossAttnDownBlock3D",
+    "CrossAttnDownBlock3D",
+    "DownBlock3D"
+  ],
+  "downsample_padding": 1,
+  "dropout": 0.0,
+  "dual_cross_attention": false,
+  "emo_drop_rate": 0.05,
+  "encoder_hid_dim": null,
+  "encoder_hid_dim_type": null,
+  "flip_sin_to_cos": true,
+  "freq_shift": 0,
+  "in_channels": 4,
+  "layers_per_block": 2,
+  "mid_block_only_cross_attention": null,
+  "mid_block_scale_factor": 1,
+  "mid_block_type": "UNetMidBlock3DCrossAttn",
+  "motion_module_kwargs": {
+    "attention_block_types": [
+      "Temporal_Self",
+      "Temporal_Self"
+    ],
+    "num_attention_heads": 8,
+    "num_transformer_block": 1,
+    "temporal_attention_dim_div": 1,
+    "temporal_position_encoding": true,
+    "temporal_position_encoding_max_len": 32
+  },
+  "motion_module_resolutions": [
+    1,
+    2,
+    4,
+    8
+  ],
+  "norm_eps": 1e-05,
+  "norm_num_groups": 32,
+  "num_attention_heads": null,
+  "num_class_embeds": null,
+  "only_cross_attention": false,
+  "out_channels": 4,
+  "projection_class_embeddings_input_dim": null,
+  "resnet_time_scale_shift": "default",
+  "reverse_transformer_layers_per_block": null,
+  "sample_size": 64,
+  "time_cond_proj_dim": null,
+  "time_embedding_act_fn": null,
+  "time_embedding_dim": null,
+  "time_embedding_type": "positional",
+  "timestep_post_act": null,
+  "transformer_layers_per_block": 1,
+  "unet_use_cross_frame_attention": false,
+  "unet_use_temporal_attention": false,
+  "up_block_types": [
+    "UpBlock3D",
+    "CrossAttnUpBlock3D",
+    "CrossAttnUpBlock3D",
+    "CrossAttnUpBlock3D"
+  ],
+  "upcast_attention": false,
+  "use_inflated_groupnorm": true,
+  "use_linear_projection": false
+}

checkpoint-7000/diffusion_net/diffusion_pytorch_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e24b052422aa8c6dc6b0ed81d03492b83928fc0c25d437cecfda6c65b52348c1
+size 3356369240

checkpoint-7000/image_proj/config.json ADDED Viewed

	@@ -0,0 +1,5 @@

+{
+  "_class_name": "ImageProjModel",
+  "_diffusers_version": "0.31.0",
+  "_name_or_path": "memoavatar/memo"
+}

checkpoint-7000/image_proj/diffusion_pytorch_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9855dabf683d8c17b265567737bc90e5a0b032db7551b78f028cd9b03439abe5
+size 3155280

checkpoint-7000/optimizer.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:6e6bee6eb8cfb1c8bd3314731385820e5ee5dbb0bd57dbb1a55c1ba676f34c2a
+size 3270385552

checkpoint-7000/random_states_0.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:1bc142465f2d168b5cbd99f93129ea9d61c76a52d868f8d39dbbaed3581a6afd
+size 14344

checkpoint-7000/reference_net/config.json ADDED Viewed

	@@ -0,0 +1,66 @@

+{
+  "_center_input_sample": false,
+  "_class_name": "UNet2DConditionModel",
+  "_diffusers_version": "0.31.0",
+  "_name_or_path": "memoavatar/memo",
+  "_out_channels": 4,
+  "act_fn": "silu",
+  "addition_embed_type": null,
+  "addition_embed_type_num_heads": 64,
+  "addition_time_embed_dim": null,
+  "attention_head_dim": 8,
+  "attention_type": "default",
+  "block_out_channels": [
+    320,
+    640,
+    1280,
+    1280
+  ],
+  "center_input_sample": false,
+  "class_embed_type": null,
+  "class_embeddings_concat": false,
+  "conv_in_kernel": 3,
+  "cross_attention_dim": 768,
+  "down_block_types": [
+    "CrossAttnDownBlock2D",
+    "CrossAttnDownBlock2D",
+    "CrossAttnDownBlock2D",
+    "DownBlock2D"
+  ],
+  "downsample_padding": 1,
+  "dropout": 0.0,
+  "dual_cross_attention": false,
+  "encoder_hid_dim": null,
+  "encoder_hid_dim_type": null,
+  "flip_sin_to_cos": true,
+  "freq_shift": 0,
+  "in_channels": 4,
+  "layers_per_block": 2,
+  "mid_block_only_cross_attention": null,
+  "mid_block_scale_factor": 1,
+  "mid_block_type": "UNetMidBlock2DCrossAttn",
+  "norm_eps": 1e-05,
+  "norm_num_groups": 32,
+  "num_attention_heads": null,
+  "num_class_embeds": null,
+  "only_cross_attention": false,
+  "out_channels": 4,
+  "projection_class_embeddings_input_dim": null,
+  "resnet_time_scale_shift": "default",
+  "reverse_transformer_layers_per_block": null,
+  "sample_size": 64,
+  "time_cond_proj_dim": null,
+  "time_embedding_act_fn": null,
+  "time_embedding_dim": null,
+  "time_embedding_type": "positional",
+  "timestep_post_act": null,
+  "transformer_layers_per_block": 1,
+  "up_block_types": [
+    "UpBlock2D",
+    "CrossAttnUpBlock2D",
+    "CrossAttnUpBlock2D",
+    "CrossAttnUpBlock2D"
+  ],
+  "upcast_attention": false,
+  "use_linear_projection": false
+}

checkpoint-7000/reference_net/diffusion_pytorch_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:0550a76a5df4c295610040f90075b3bbf64f7d6be58cfc93a9722be1377b2486
+size 1714214152

checkpoint-7000/scheduler.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:c4f80bd4cf789dadaabab339ee4bbcf7677008e876201ba9229aae4c3d8860b1
+size 1000

config.yaml ADDED Viewed

	@@ -0,0 +1,56 @@

+tracker_project_name: memo
+output_dir: outputs
+resume_from_checkpoint: null
+model_name_or_path: memoavatar/memo
+vae: stabilityai/sd-vae-ft-mse
+gradient_checkpointing: true
+gradient_accumulation_steps: 1
+train_batch_size: 2
+max_train_steps: 3500
+num_train_epochs: -1
+enable_xformers_memory_efficient_attention: true
+checkpoints_total_limit: 3
+robust_training: true
+learning_rate: 1e-5
+max_grad_norm: 1.0
+scale_lr: false
+lr_scheduler: constant
+lr_warmup_steps: 0
+seed: 42
+mixed_precision: bf16
+use_8bit_adam: false
+allow_tf32: true
+use_ema: false
+adam_beta1: 0.9
+adam_beta2: 0.999
+adam_weight_decay: 0.01
+adam_epsilon: 1e-08
+dataloader_num_workers: 16
+prefetch_factor: 4
+checkpointing_steps: 5000
+data:
+  width: 512
+  height: 512
+  num_past_frames: 16
+  dynamic_past_frames: false
+  n_sample_frames: 16
+  audio_margin: 2
+  metadata_paths:
+    - assets/embedding/metadata.jsonl
+weighting_scheme: logit_normal
+logit_mean: 0.0
+logit_std: 1.0
+mode_scale: 1.29
+noise_scheduler_kwargs:
+  num_train_timesteps: 1000
+train_reference_net: true
+train_diffusion_net: true
+train_image_proj: true
+train_audio_proj: true
+trainable_modules:
+  - to_q
+  - to_k
+  - to_v
+uncond_img_ratio: 0.05
+uncond_audio_ratio: 0.05
+start_ratio: 0.05

logs/memo/1741022678.2856228/events.out.tfevents.1741022678.3c52c6a06e3b ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:82468bdd6d38c969094750adaf19ce3778a9c50fd26b45c2e643f9852bcd80fd
+size 2329

logs/memo/1741022678.2878249/hparams.yml ADDED Viewed

	@@ -0,0 +1,55 @@

+adam_beta1: 0.9
+adam_beta2: 0.999
+adam_epsilon: 1.0e-08
+adam_weight_decay: 0.01
+allow_tf32: true
+checkpointing_steps: 5000
+checkpoints_total_limit: 20
+data:
+  audio_margin: 2
+  dynamic_past_frames: false
+  height: 512
+  metadata_paths:
+  - data/embedding/metadata.jsonl
+  n_sample_frames: 16
+  num_past_frames: 16
+  width: 512
+dataloader_num_workers: 16
+enable_xformers_memory_efficient_attention: true
+gradient_accumulation_steps: 1
+gradient_checkpointing: true
+learning_rate: 1.0e-05
+logit_mean: 0.0
+logit_std: 1.0
+lr_scheduler: constant
+lr_warmup_steps: 0
+max_grad_norm: 1.0
+max_train_steps: 3500
+mixed_precision: bf16
+mode_scale: 1.29
+model_name_or_path: memoavatar/memo
+noise_scheduler_kwargs:
+  num_train_timesteps: 1000
+num_train_epochs: 350
+output_dir: outputs/finetune
+prefetch_factor: 4
+resume_from_checkpoint: null
+robust_training: true
+scale_lr: false
+seed: 42
+start_ratio: 0.05
+tracker_project_name: memo
+train_audio_proj: false
+train_batch_size: 1
+train_diffusion_net: true
+train_image_proj: false
+train_reference_net: false
+trainable_modules:
+- motion_modules
+- audio_modules
+uncond_audio_ratio: 0.05
+uncond_img_ratio: 0.05
+use_8bit_adam: false
+use_ema: false
+vae: stabilityai/sd-vae-ft-mse
+weighting_scheme: logit_normal

logs/memo/1741046967.6854546/events.out.tfevents.1741046967.369f90aa7051 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:516216cf24c6ea13cf020b0b334970a9d7dcaef079db3a8fd0d3cdbc21aa1b58
+size 2329

logs/memo/1741046967.688156/.ipynb_checkpoints/hparams-checkpoint.yml ADDED Viewed

	@@ -0,0 +1,55 @@

+adam_beta1: 0.9
+adam_beta2: 0.999
+adam_epsilon: 1.0e-08
+adam_weight_decay: 0.01
+allow_tf32: true
+checkpointing_steps: 5000
+checkpoints_total_limit: 20
+data:
+  audio_margin: 2
+  dynamic_past_frames: false
+  height: 512
+  metadata_paths:
+  - data/embedding/metadata.jsonl
+  n_sample_frames: 16
+  num_past_frames: 16
+  width: 512
+dataloader_num_workers: 16
+enable_xformers_memory_efficient_attention: true
+gradient_accumulation_steps: 1
+gradient_checkpointing: true
+learning_rate: 1.0e-05
+logit_mean: 0.0
+logit_std: 1.0
+lr_scheduler: constant
+lr_warmup_steps: 0
+max_grad_norm: 1.0
+max_train_steps: 3500
+mixed_precision: bf16
+mode_scale: 1.29
+model_name_or_path: memoavatar/memo
+noise_scheduler_kwargs:
+  num_train_timesteps: 1000
+num_train_epochs: 700
+output_dir: outputs/finetune
+prefetch_factor: 4
+resume_from_checkpoint: null
+robust_training: true
+scale_lr: false
+seed: 42
+start_ratio: 0.05
+tracker_project_name: memo
+train_audio_proj: false
+train_batch_size: 2
+train_diffusion_net: true
+train_image_proj: false
+train_reference_net: false
+trainable_modules:
+- motion_modules
+- audio_modules
+uncond_audio_ratio: 0.05
+uncond_img_ratio: 0.05
+use_8bit_adam: true
+use_ema: false
+vae: stabilityai/sd-vae-ft-mse
+weighting_scheme: logit_normal

logs/memo/1741046967.688156/hparams.yml ADDED Viewed

	@@ -0,0 +1,55 @@

+adam_beta1: 0.9
+adam_beta2: 0.999
+adam_epsilon: 1.0e-08
+adam_weight_decay: 0.01
+allow_tf32: true
+checkpointing_steps: 5000
+checkpoints_total_limit: 20
+data:
+  audio_margin: 2
+  dynamic_past_frames: false
+  height: 512
+  metadata_paths:
+  - data/embedding/metadata.jsonl
+  n_sample_frames: 16
+  num_past_frames: 16
+  width: 512
+dataloader_num_workers: 16
+enable_xformers_memory_efficient_attention: true
+gradient_accumulation_steps: 1
+gradient_checkpointing: true
+learning_rate: 1.0e-05
+logit_mean: 0.0
+logit_std: 1.0
+lr_scheduler: constant
+lr_warmup_steps: 0
+max_grad_norm: 1.0
+max_train_steps: 3500
+mixed_precision: bf16
+mode_scale: 1.29
+model_name_or_path: memoavatar/memo
+noise_scheduler_kwargs:
+  num_train_timesteps: 1000
+num_train_epochs: 700
+output_dir: outputs/finetune
+prefetch_factor: 4
+resume_from_checkpoint: null
+robust_training: true
+scale_lr: false
+seed: 42
+start_ratio: 0.05
+tracker_project_name: memo
+train_audio_proj: false
+train_batch_size: 2
+train_diffusion_net: true
+train_image_proj: false
+train_reference_net: false
+trainable_modules:
+- motion_modules
+- audio_modules
+uncond_audio_ratio: 0.05
+uncond_img_ratio: 0.05
+use_8bit_adam: true
+use_ema: false
+vae: stabilityai/sd-vae-ft-mse
+weighting_scheme: logit_normal

logs/memo/1741060237.980531/events.out.tfevents.1741060237.369f90aa7051 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:657998ec24fdf97d92613eada6193a47b49c50fb100f3b8f5ec4e25dd8d11249
+size 2329

logs/memo/1741060237.983221/hparams.yml ADDED Viewed

	@@ -0,0 +1,55 @@

+adam_beta1: 0.9
+adam_beta2: 0.999
+adam_epsilon: 1.0e-08
+adam_weight_decay: 0.01
+allow_tf32: true
+checkpointing_steps: 5000
+checkpoints_total_limit: 20
+data:
+  audio_margin: 2
+  dynamic_past_frames: false
+  height: 512
+  metadata_paths:
+  - data/embedding/metadata.jsonl
+  n_sample_frames: 16
+  num_past_frames: 16
+  width: 512
+dataloader_num_workers: 16
+enable_xformers_memory_efficient_attention: true
+gradient_accumulation_steps: 1
+gradient_checkpointing: true
+learning_rate: 1.0e-05
+logit_mean: 0.0
+logit_std: 1.0
+lr_scheduler: constant
+lr_warmup_steps: 0
+max_grad_norm: 1.0
+max_train_steps: 3500
+mixed_precision: bf16
+mode_scale: 1.29
+model_name_or_path: memoavatar/memo
+noise_scheduler_kwargs:
+  num_train_timesteps: 1000
+num_train_epochs: 700
+output_dir: outputs/finetune
+prefetch_factor: 4
+resume_from_checkpoint: null
+robust_training: true
+scale_lr: false
+seed: 42
+start_ratio: 0.05
+tracker_project_name: memo
+train_audio_proj: false
+train_batch_size: 2
+train_diffusion_net: true
+train_image_proj: false
+train_reference_net: false
+trainable_modules:
+- motion_modules
+- audio_modules
+uncond_audio_ratio: 0.05
+uncond_img_ratio: 0.05
+use_8bit_adam: true
+use_ema: false
+vae: stabilityai/sd-vae-ft-mse
+weighting_scheme: logit_normal

logs/memo/1741060310.7917793/events.out.tfevents.1741060310.369f90aa7051 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:f7ab603a92f4ae6f27f318a54ebff660a0c9e707799e389f65f2440b9eb176a2
+size 2329

logs/memo/1741060310.7946274/hparams.yml ADDED Viewed

	@@ -0,0 +1,55 @@

+adam_beta1: 0.9
+adam_beta2: 0.999
+adam_epsilon: 1.0e-08
+adam_weight_decay: 0.01
+allow_tf32: true
+checkpointing_steps: 5000
+checkpoints_total_limit: 20
+data:
+  audio_margin: 2
+  dynamic_past_frames: false
+  height: 512
+  metadata_paths:
+  - data/embedding/metadata.jsonl
+  n_sample_frames: 16
+  num_past_frames: 16
+  width: 512
+dataloader_num_workers: 16
+enable_xformers_memory_efficient_attention: true
+gradient_accumulation_steps: 1
+gradient_checkpointing: true
+learning_rate: 1.0e-05
+logit_mean: 0.0
+logit_std: 1.0
+lr_scheduler: constant
+lr_warmup_steps: 0
+max_grad_norm: 1.0
+max_train_steps: 3500
+mixed_precision: bf16
+mode_scale: 1.29
+model_name_or_path: memoavatar/memo
+noise_scheduler_kwargs:
+  num_train_timesteps: 1000
+num_train_epochs: 875
+output_dir: outputs/finetune
+prefetch_factor: 4
+resume_from_checkpoint: null
+robust_training: true
+scale_lr: false
+seed: 42
+start_ratio: 0.05
+tracker_project_name: memo
+train_audio_proj: false
+train_batch_size: 3
+train_diffusion_net: true
+train_image_proj: false
+train_reference_net: false
+trainable_modules:
+- motion_modules
+- audio_modules
+uncond_audio_ratio: 0.05
+uncond_img_ratio: 0.05
+use_8bit_adam: true
+use_ema: false
+vae: stabilityai/sd-vae-ft-mse
+weighting_scheme: logit_normal

logs/memo/1741060338.6906004/events.out.tfevents.1741060338.369f90aa7051 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:b5e705e83cd16356a16b87991f60c4e05562b9765a3ef319ff245e211feaf078
+size 2329

logs/memo/1741060338.6932552/hparams.yml ADDED Viewed

	@@ -0,0 +1,55 @@

+adam_beta1: 0.9
+adam_beta2: 0.999
+adam_epsilon: 1.0e-08
+adam_weight_decay: 0.01
+allow_tf32: true
+checkpointing_steps: 5000
+checkpoints_total_limit: 20
+data:
+  audio_margin: 2
+  dynamic_past_frames: false
+  height: 512
+  metadata_paths:
+  - data/embedding/metadata.jsonl
+  n_sample_frames: 16
+  num_past_frames: 16
+  width: 512
+dataloader_num_workers: 16
+enable_xformers_memory_efficient_attention: true
+gradient_accumulation_steps: 1
+gradient_checkpointing: true
+learning_rate: 1.0e-05
+logit_mean: 0.0
+logit_std: 1.0
+lr_scheduler: constant
+lr_warmup_steps: 0
+max_grad_norm: 1.0
+max_train_steps: 3500
+mixed_precision: bf16
+mode_scale: 1.29
+model_name_or_path: memoavatar/memo
+noise_scheduler_kwargs:
+  num_train_timesteps: 1000
+num_train_epochs: 350
+output_dir: outputs/finetune
+prefetch_factor: 4
+resume_from_checkpoint: null
+robust_training: true
+scale_lr: false
+seed: 42
+start_ratio: 0.05
+tracker_project_name: memo
+train_audio_proj: false
+train_batch_size: 1
+train_diffusion_net: true
+train_image_proj: false
+train_reference_net: false
+trainable_modules:
+- motion_modules
+- audio_modules
+uncond_audio_ratio: 0.05
+uncond_img_ratio: 0.05
+use_8bit_adam: true
+use_ema: false
+vae: stabilityai/sd-vae-ft-mse
+weighting_scheme: logit_normal

logs/memo/1741145698.900579/events.out.tfevents.1741145698.16e2a27e51cb ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9ce7a3b494bd3470fb3b8c6543ac40d7612c5c71a2c1290c5b5fd711fd79ec93
+size 2329